Move static code outside of unicode.py.
This commit is contained in:
76
src/libstd_unicode/bool_trie.rs
Normal file
76
src/libstd_unicode/bool_trie.rs
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||||
|
// file at the top-level directory of this distribution and at
|
||||||
|
// http://rust-lang.org/COPYRIGHT.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||||
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||||
|
// option. This file may not be copied, modified, or distributed
|
||||||
|
// except according to those terms.
|
||||||
|
|
||||||
|
/// BoolTrie is a trie for representing a set of Unicode codepoints. It is
|
||||||
|
/// implemented with postfix compression (sharing of identical child nodes),
|
||||||
|
/// which gives both compact size and fast lookup.
|
||||||
|
///
|
||||||
|
/// The space of Unicode codepoints is divided into 3 subareas, each
|
||||||
|
/// represented by a trie with different depth. In the first (0..0x800), there
|
||||||
|
/// is no trie structure at all; each u64 entry corresponds to a bitvector
|
||||||
|
/// effectively holding 64 bool values.
|
||||||
|
///
|
||||||
|
/// In the second (0x800..0x10000), each child of the root node represents a
|
||||||
|
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
|
||||||
|
/// the trie stores an 8-bit index into a shared table of leaf values. This
|
||||||
|
/// exploits the fact that in reasonable sets, many such leaves can be shared.
|
||||||
|
///
|
||||||
|
/// In the third (0x10000..0x110000), each child of the root node represents a
|
||||||
|
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
|
||||||
|
/// of a child tree. Each of these 64 bytes represents an index into the table
|
||||||
|
/// of shared 64-bit leaf values. This exploits the sparse structure in the
|
||||||
|
/// non-BMP range of most Unicode sets.
|
||||||
|
pub struct BoolTrie {
|
||||||
|
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
|
||||||
|
pub r1: [u64; 32], // leaves: one u64 per 64 code points (0x800 / 64 = 32)
|
||||||
|
|
||||||
|
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
|
||||||
|
pub r2: [u8; 992], // first level: one leaf index per 64 code points ((0x10000 - 0x800) / 64 = 992)
|
||||||
|
pub r3: &'static [u64], // leaves, indexed by the values stored in r2
|
||||||
|
|
||||||
|
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
|
||||||
|
pub r4: [u8; 256], // first level: one entry per 4096 code points (0x100000 / 4096 = 256)
|
||||||
|
pub r5: &'static [u8], // second level: 64-entry rows selected by r4, each entry an index into r6
|
||||||
|
pub r6: &'static [u64], // leaves, indexed by the values stored in r5
|
||||||
|
}
|
||||||
|
impl BoolTrie {
|
||||||
|
/// Returns `true` if the code point `c` is a member of the set encoded by
/// this trie.
///
/// Panics if an index stored in `r2`, `r4` or `r5` points outside the table
/// it refers to (`r3`, `r5` or `r6`), since those tables are indexed directly.
pub fn lookup(&self, c: char) -> bool {
|
||||||
|
let c = c as usize;
|
||||||
|
// 0..0x800: `r1` holds the leaf for every 64-wide chunk directly.
if c < 0x800 {
|
||||||
|
trie_range_leaf(c, self.r1[c >> 6])
|
||||||
|
// 0x800..0x10000: `r2` starts at chunk 0x800 >> 6 == 0x20, hence the offset.
} else if c < 0x10000 {
|
||||||
|
let child = self.r2[(c >> 6) - 0x20];
|
||||||
|
trie_range_leaf(c, self.r3[child as usize])
|
||||||
|
// 0x10000 and above: `r4` starts at chunk 0x10000 >> 12 == 0x10, hence the offset.
} else {
|
||||||
|
let child = self.r4[(c >> 12) - 0x10];
|
||||||
|
// `child` selects a 64-entry row of `r5`; bits 6..12 of `c` select the entry in that row.
let leaf = self.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
|
||||||
|
trie_range_leaf(c, self.r6[leaf as usize])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A two-level set of code points: `r1` maps each 64-wide chunk of code
/// points to an index into the leaf table `r2`, and each `u64` leaf holds the
/// 64 membership bits of its chunk. `r1` may be shorter than the full code
/// point space; `lookup` reports chunks past its end as not in the set.
pub struct SmallBoolTrie {
|
||||||
|
pub(crate) r1: &'static [u8], // first level: leaf index for each 64-wide chunk
|
||||||
|
pub(crate) r2: &'static [u64], // leaves, indexed by the values stored in r1
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SmallBoolTrie {
|
||||||
|
/// Returns `true` if the code point `c` is a member of the set encoded by
/// this trie. Code points whose 64-wide chunk lies beyond the end of `r1`
/// yield `false`.
///
/// Panics if an index stored in `r1` points outside `r2`, which is indexed
/// directly.
pub fn lookup(&self, c: char) -> bool {
|
||||||
|
let c = c as usize;
|
||||||
|
// `get` (rather than indexing) makes an out-of-range chunk a miss instead of a panic.
match self.r1.get(c >> 6) {
|
||||||
|
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
|
||||||
|
None => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tests bit `c & 63` of `bitmap_chunk`, i.e. the bit belonging to `c` within
/// the 64-wide chunk that the leaf `bitmap_chunk` describes.
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
|
||||||
|
((bitmap_chunk >> (c & 63)) & 1) != 0
|
||||||
|
}
|
||||||
@@ -47,7 +47,9 @@ pub use core::char::CharTryFromError;
|
|||||||
#[unstable(feature = "decode_utf8", issue = "33906")]
|
#[unstable(feature = "decode_utf8", issue = "33906")]
|
||||||
pub use core::char::{DecodeUtf8, decode_utf8};
|
pub use core::char::{DecodeUtf8, decode_utf8};
|
||||||
#[unstable(feature = "unicode", issue = "27783")]
|
#[unstable(feature = "unicode", issue = "27783")]
|
||||||
pub use tables::{UnicodeVersion, UNICODE_VERSION};
|
pub use tables::{UNICODE_VERSION};
|
||||||
|
#[unstable(feature = "unicode", issue = "27783")]
|
||||||
|
pub use version::UnicodeVersion;
|
||||||
|
|
||||||
/// Returns an iterator that yields the lowercase equivalent of a `char`.
|
/// Returns an iterator that yields the lowercase equivalent of a `char`.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -35,15 +35,18 @@
|
|||||||
#![feature(core_char_ext)]
|
#![feature(core_char_ext)]
|
||||||
#![feature(str_internals)]
|
#![feature(str_internals)]
|
||||||
#![feature(decode_utf8)]
|
#![feature(decode_utf8)]
|
||||||
#![feature(fused)]
|
|
||||||
#![feature(fn_traits)]
|
#![feature(fn_traits)]
|
||||||
|
#![feature(fused)]
|
||||||
#![feature(lang_items)]
|
#![feature(lang_items)]
|
||||||
|
#![feature(non_exhaustive)]
|
||||||
#![feature(staged_api)]
|
#![feature(staged_api)]
|
||||||
#![feature(try_from)]
|
#![feature(try_from)]
|
||||||
#![feature(unboxed_closures)]
|
#![feature(unboxed_closures)]
|
||||||
|
|
||||||
|
mod bool_trie;
|
||||||
mod tables;
|
mod tables;
|
||||||
mod u_str;
|
mod u_str;
|
||||||
|
mod version;
|
||||||
pub mod char;
|
pub mod char;
|
||||||
pub mod lossy;
|
pub mod lossy;
|
||||||
|
|
||||||
|
|||||||
@@ -12,23 +12,8 @@
|
|||||||
|
|
||||||
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
|
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
|
||||||
|
|
||||||
/// Represents a Unicode Version.
|
use version::UnicodeVersion;
|
||||||
///
|
use bool_trie::{BoolTrie, SmallBoolTrie};
|
||||||
/// See also: <http://www.unicode.org/versions/>
|
|
||||||
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
|
|
||||||
pub struct UnicodeVersion {
|
|
||||||
/// Major version.
|
|
||||||
pub major: u32,
|
|
||||||
|
|
||||||
/// Minor version.
|
|
||||||
pub minor: u32,
|
|
||||||
|
|
||||||
/// Micro (or Update) version.
|
|
||||||
pub micro: u32,
|
|
||||||
|
|
||||||
// Private field to keep struct expandable.
|
|
||||||
_priv: (),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
|
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
|
||||||
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
|
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
|
||||||
@@ -38,76 +23,8 @@ pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
|
|||||||
micro: 0,
|
micro: 0,
|
||||||
_priv: (),
|
_priv: (),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// BoolTrie is a trie for representing a set of Unicode codepoints. It is
|
|
||||||
// implemented with postfix compression (sharing of identical child nodes),
|
|
||||||
// which gives both compact size and fast lookup.
|
|
||||||
//
|
|
||||||
// The space of Unicode codepoints is divided into 3 subareas, each
|
|
||||||
// represented by a trie with different depth. In the first (0..0x800), there
|
|
||||||
// is no trie structure at all; each u64 entry corresponds to a bitvector
|
|
||||||
// effectively holding 64 bool values.
|
|
||||||
//
|
|
||||||
// In the second (0x800..0x10000), each child of the root node represents a
|
|
||||||
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
|
|
||||||
// the trie stores an 8-bit index into a shared table of leaf values. This
|
|
||||||
// exploits the fact that in reasonable sets, many such leaves can be shared.
|
|
||||||
//
|
|
||||||
// In the third (0x10000..0x110000), each child of the root node represents a
|
|
||||||
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
|
|
||||||
// of a child tree. Each of these 64 bytes represents an index into the table
|
|
||||||
// of shared 64-bit leaf values. This exploits the sparse structure in the
|
|
||||||
// non-BMP range of most Unicode sets.
|
|
||||||
pub struct BoolTrie {
|
|
||||||
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
|
|
||||||
r1: [u64; 32], // leaves
|
|
||||||
|
|
||||||
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
|
|
||||||
r2: [u8; 992], // first level
|
|
||||||
r3: &'static [u64], // leaves
|
|
||||||
|
|
||||||
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
|
|
||||||
r4: [u8; 256], // first level
|
|
||||||
r5: &'static [u8], // second level
|
|
||||||
r6: &'static [u64], // leaves
|
|
||||||
}
|
|
||||||
|
|
||||||
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
|
|
||||||
((bitmap_chunk >> (c & 63)) & 1) != 0
|
|
||||||
}
|
|
||||||
|
|
||||||
fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
|
|
||||||
let c = c as usize;
|
|
||||||
if c < 0x800 {
|
|
||||||
trie_range_leaf(c, r.r1[c >> 6])
|
|
||||||
} else if c < 0x10000 {
|
|
||||||
let child = r.r2[(c >> 6) - 0x20];
|
|
||||||
trie_range_leaf(c, r.r3[child as usize])
|
|
||||||
} else {
|
|
||||||
let child = r.r4[(c >> 12) - 0x10];
|
|
||||||
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
|
|
||||||
trie_range_leaf(c, r.r6[leaf as usize])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct SmallBoolTrie {
|
|
||||||
r1: &'static [u8], // first level
|
|
||||||
r2: &'static [u64], // leaves
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SmallBoolTrie {
|
|
||||||
fn lookup(&self, c: char) -> bool {
|
|
||||||
let c = c as usize;
|
|
||||||
match self.r1.get(c >> 6) {
|
|
||||||
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
|
|
||||||
None => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub mod general_category {
|
pub mod general_category {
|
||||||
pub const Cc_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
|
pub const Cc_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
|
||||||
r1: &[
|
r1: &[
|
||||||
0, 1, 0
|
0, 1, 0
|
||||||
],
|
],
|
||||||
@@ -120,7 +37,7 @@ pub mod general_category {
|
|||||||
Cc_table.lookup(c)
|
Cc_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const N_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const N_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x03ff000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
|
0x03ff000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
|
||||||
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
|
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
|
||||||
@@ -212,13 +129,13 @@ pub mod general_category {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn N(c: char) -> bool {
|
pub fn N(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, N_table)
|
N_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub mod derived_property {
|
pub mod derived_property {
|
||||||
pub const Alphabetic_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const Alphabetic_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
|
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
|
||||||
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
|
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
|
||||||
@@ -397,10 +314,10 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn Alphabetic(c: char) -> bool {
|
pub fn Alphabetic(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, Alphabetic_table)
|
Alphabetic_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const Case_Ignorable_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const Case_Ignorable_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x0400408000000000, 0x0000000140000000, 0x0190a10000000000, 0x0000000000000000,
|
0x0400408000000000, 0x0000000140000000, 0x0190a10000000000, 0x0000000000000000,
|
||||||
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
|
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
|
||||||
@@ -529,10 +446,10 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn Case_Ignorable(c: char) -> bool {
|
pub fn Case_Ignorable(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, Case_Ignorable_table)
|
Case_Ignorable_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const Cased_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const Cased_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
|
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
|
||||||
0xffffffffffffffff, 0xffffffffffffffff, 0xf7ffffffffffffff, 0xfffffffffffffff0,
|
0xffffffffffffffff, 0xffffffffffffffff, 0xf7ffffffffffffff, 0xfffffffffffffff0,
|
||||||
@@ -628,10 +545,10 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn Cased(c: char) -> bool {
|
pub fn Cased(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, Cased_table)
|
Cased_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const Lowercase_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const Lowercase_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x0000000000000000, 0x07fffffe00000000, 0x0420040000000000, 0xff7fffff80000000,
|
0x0000000000000000, 0x07fffffe00000000, 0x0420040000000000, 0xff7fffff80000000,
|
||||||
0x55aaaaaaaaaaaaaa, 0xd4aaaaaaaaaaab55, 0xe6512d2a4e243129, 0xaa29aaaab5555240,
|
0x55aaaaaaaaaaaaaa, 0xd4aaaaaaaaaaab55, 0xe6512d2a4e243129, 0xaa29aaaab5555240,
|
||||||
@@ -725,10 +642,10 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn Lowercase(c: char) -> bool {
|
pub fn Lowercase(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, Lowercase_table)
|
Lowercase_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const Uppercase_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const Uppercase_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x0000000000000000, 0x0000000007fffffe, 0x0000000000000000, 0x000000007f7fffff,
|
0x0000000000000000, 0x0000000007fffffe, 0x0000000000000000, 0x000000007f7fffff,
|
||||||
0xaa55555555555555, 0x2b555555555554aa, 0x11aed2d5b1dbced6, 0x55d255554aaaa490,
|
0xaa55555555555555, 0x2b555555555554aa, 0x11aed2d5b1dbced6, 0x55d255554aaaa490,
|
||||||
@@ -823,10 +740,10 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn Uppercase(c: char) -> bool {
|
pub fn Uppercase(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, Uppercase_table)
|
Uppercase_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const XID_Continue_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const XID_Continue_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x03ff000000000000, 0x07fffffe87fffffe, 0x04a0040000000000, 0xff7fffffff7fffff,
|
0x03ff000000000000, 0x07fffffe87fffffe, 0x04a0040000000000, 0xff7fffffff7fffff,
|
||||||
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
|
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
|
||||||
@@ -998,10 +915,10 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn XID_Continue(c: char) -> bool {
|
pub fn XID_Continue(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, XID_Continue_table)
|
XID_Continue_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const XID_Start_table: &'static super::BoolTrie = &super::BoolTrie {
|
pub const XID_Start_table: &super::BoolTrie = &super::BoolTrie {
|
||||||
r1: [
|
r1: [
|
||||||
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
|
0x0000000000000000, 0x07fffffe07fffffe, 0x0420040000000000, 0xff7fffffff7fffff,
|
||||||
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
|
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
|
||||||
@@ -1175,13 +1092,13 @@ pub mod derived_property {
|
|||||||
};
|
};
|
||||||
|
|
||||||
pub fn XID_Start(c: char) -> bool {
|
pub fn XID_Start(c: char) -> bool {
|
||||||
super::trie_lookup_range_table(c, XID_Start_table)
|
XID_Start_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub mod property {
|
pub mod property {
|
||||||
pub const Pattern_White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
|
pub const Pattern_White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
|
||||||
r1: &[
|
r1: &[
|
||||||
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
@@ -1198,7 +1115,7 @@ pub mod property {
|
|||||||
Pattern_White_Space_table.lookup(c)
|
Pattern_White_Space_table.lookup(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const White_Space_table: &'static super::SmallBoolTrie = &super::SmallBoolTrie {
|
pub const White_Space_table: &super::SmallBoolTrie = &super::SmallBoolTrie {
|
||||||
r1: &[
|
r1: &[
|
||||||
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
@@ -1238,11 +1155,11 @@ pub mod conversions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
|
fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
|
||||||
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
|
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
|
||||||
}
|
}
|
||||||
|
|
||||||
const to_lowercase_table: &'static [(char, [char; 3])] = &[
|
const to_lowercase_table: &[(char, [char; 3])] = &[
|
||||||
('\u{41}', ['\u{61}', '\0', '\0']), ('\u{42}', ['\u{62}', '\0', '\0']), ('\u{43}',
|
('\u{41}', ['\u{61}', '\0', '\0']), ('\u{42}', ['\u{62}', '\0', '\0']), ('\u{43}',
|
||||||
['\u{63}', '\0', '\0']), ('\u{44}', ['\u{64}', '\0', '\0']), ('\u{45}', ['\u{65}', '\0',
|
['\u{63}', '\0', '\0']), ('\u{44}', ['\u{64}', '\0', '\0']), ('\u{45}', ['\u{65}', '\0',
|
||||||
'\0']), ('\u{46}', ['\u{66}', '\0', '\0']), ('\u{47}', ['\u{67}', '\0', '\0']), ('\u{48}',
|
'\0']), ('\u{46}', ['\u{66}', '\0', '\0']), ('\u{47}', ['\u{67}', '\0', '\0']), ('\u{48}',
|
||||||
@@ -1826,7 +1743,7 @@ pub mod conversions {
|
|||||||
('\u{1e920}', ['\u{1e942}', '\0', '\0']), ('\u{1e921}', ['\u{1e943}', '\0', '\0'])
|
('\u{1e920}', ['\u{1e942}', '\0', '\0']), ('\u{1e921}', ['\u{1e943}', '\0', '\0'])
|
||||||
];
|
];
|
||||||
|
|
||||||
const to_uppercase_table: &'static [(char, [char; 3])] = &[
|
const to_uppercase_table: &[(char, [char; 3])] = &[
|
||||||
('\u{61}', ['\u{41}', '\0', '\0']), ('\u{62}', ['\u{42}', '\0', '\0']), ('\u{63}',
|
('\u{61}', ['\u{41}', '\0', '\0']), ('\u{62}', ['\u{42}', '\0', '\0']), ('\u{63}',
|
||||||
['\u{43}', '\0', '\0']), ('\u{64}', ['\u{44}', '\0', '\0']), ('\u{65}', ['\u{45}', '\0',
|
['\u{43}', '\0', '\0']), ('\u{64}', ['\u{44}', '\0', '\0']), ('\u{65}', ['\u{45}', '\0',
|
||||||
'\0']), ('\u{66}', ['\u{46}', '\0', '\0']), ('\u{67}', ['\u{47}', '\0', '\0']), ('\u{68}',
|
'\0']), ('\u{66}', ['\u{46}', '\0', '\0']), ('\u{67}', ['\u{47}', '\0', '\0']), ('\u{68}',
|
||||||
|
|||||||
@@ -38,6 +38,9 @@ preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRI
|
|||||||
// NOTE: The following code was generated by "./unicode.py", do not edit directly
|
// NOTE: The following code was generated by "./unicode.py", do not edit directly
|
||||||
|
|
||||||
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
|
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
|
||||||
|
|
||||||
|
use version::UnicodeVersion;
|
||||||
|
use bool_trie::{BoolTrie, SmallBoolTrie};
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# Mapping taken from Table 12 from:
|
# Mapping taken from Table 12 from:
|
||||||
@@ -274,24 +277,7 @@ def load_properties(f, interestingprops):
|
|||||||
def escape_char(c):
|
def escape_char(c):
|
||||||
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
|
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
|
||||||
|
|
||||||
def emit_bsearch_range_table(f):
|
def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
|
||||||
f.write("""
|
|
||||||
fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
|
|
||||||
use core::cmp::Ordering::{Equal, Less, Greater};
|
|
||||||
r.binary_search_by(|&(lo, hi)| {
|
|
||||||
if c < lo {
|
|
||||||
Greater
|
|
||||||
} else if hi < c {
|
|
||||||
Less
|
|
||||||
} else {
|
|
||||||
Equal
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.is_ok()
|
|
||||||
}\n
|
|
||||||
""")
|
|
||||||
|
|
||||||
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
|
|
||||||
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
|
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
|
||||||
pub_string = ""
|
pub_string = ""
|
||||||
if is_pub:
|
if is_pub:
|
||||||
@@ -307,77 +293,6 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
|
|||||||
format_table_content(f, data, 8)
|
format_table_content(f, data, 8)
|
||||||
f.write("\n ];\n\n")
|
f.write("\n ];\n\n")
|
||||||
|
|
||||||
def emit_trie_lookup_range_table(f):
|
|
||||||
f.write("""
|
|
||||||
|
|
||||||
// BoolTrie is a trie for representing a set of Unicode codepoints. It is
|
|
||||||
// implemented with postfix compression (sharing of identical child nodes),
|
|
||||||
// which gives both compact size and fast lookup.
|
|
||||||
//
|
|
||||||
// The space of Unicode codepoints is divided into 3 subareas, each
|
|
||||||
// represented by a trie with different depth. In the first (0..0x800), there
|
|
||||||
// is no trie structure at all; each u64 entry corresponds to a bitvector
|
|
||||||
// effectively holding 64 bool values.
|
|
||||||
//
|
|
||||||
// In the second (0x800..0x10000), each child of the root node represents a
|
|
||||||
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
|
|
||||||
// the trie stores an 8-bit index into a shared table of leaf values. This
|
|
||||||
// exploits the fact that in reasonable sets, many such leaves can be shared.
|
|
||||||
//
|
|
||||||
// In the third (0x10000..0x110000), each child of the root node represents a
|
|
||||||
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
|
|
||||||
// of a child tree. Each of these 64 bytes represents an index into the table
|
|
||||||
// of shared 64-bit leaf values. This exploits the sparse structure in the
|
|
||||||
// non-BMP range of most Unicode sets.
|
|
||||||
pub struct BoolTrie {
|
|
||||||
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
|
|
||||||
r1: [u64; 32], // leaves
|
|
||||||
|
|
||||||
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
|
|
||||||
r2: [u8; 992], // first level
|
|
||||||
r3: &'static [u64], // leaves
|
|
||||||
|
|
||||||
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
|
|
||||||
r4: [u8; 256], // first level
|
|
||||||
r5: &'static [u8], // second level
|
|
||||||
r6: &'static [u64], // leaves
|
|
||||||
}
|
|
||||||
|
|
||||||
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
|
|
||||||
((bitmap_chunk >> (c & 63)) & 1) != 0
|
|
||||||
}
|
|
||||||
|
|
||||||
fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
|
|
||||||
let c = c as usize;
|
|
||||||
if c < 0x800 {
|
|
||||||
trie_range_leaf(c, r.r1[c >> 6])
|
|
||||||
} else if c < 0x10000 {
|
|
||||||
let child = r.r2[(c >> 6) - 0x20];
|
|
||||||
trie_range_leaf(c, r.r3[child as usize])
|
|
||||||
} else {
|
|
||||||
let child = r.r4[(c >> 12) - 0x10];
|
|
||||||
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
|
|
||||||
trie_range_leaf(c, r.r6[leaf as usize])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct SmallBoolTrie {
|
|
||||||
r1: &'static [u8], // first level
|
|
||||||
r2: &'static [u64], // leaves
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SmallBoolTrie {
|
|
||||||
fn lookup(&self, c: char) -> bool {
|
|
||||||
let c = c as usize;
|
|
||||||
match self.r1.get(c >> 6) {
|
|
||||||
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
|
|
||||||
None => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
""")
|
|
||||||
|
|
||||||
def compute_trie(rawdata, chunksize):
|
def compute_trie(rawdata, chunksize):
|
||||||
root = []
|
root = []
|
||||||
childmap = {}
|
childmap = {}
|
||||||
@@ -410,7 +325,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
|
|||||||
pub_string = ""
|
pub_string = ""
|
||||||
if is_pub:
|
if is_pub:
|
||||||
pub_string = "pub "
|
pub_string = "pub "
|
||||||
f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
|
f.write(" %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
|
||||||
f.write(" r1: [\n")
|
f.write(" r1: [\n")
|
||||||
data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
|
data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
|
||||||
format_table_content(f, data, 12)
|
format_table_content(f, data, 12)
|
||||||
@@ -458,7 +373,7 @@ def emit_small_bool_trie(f, name, t_data, is_pub=True):
|
|||||||
pub_string = ""
|
pub_string = ""
|
||||||
if is_pub:
|
if is_pub:
|
||||||
pub_string = "pub "
|
pub_string = "pub "
|
||||||
f.write(" %sconst %s: &'static super::SmallBoolTrie = &super::SmallBoolTrie {\n"
|
f.write(" %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
|
||||||
% (pub_string, name))
|
% (pub_string, name))
|
||||||
|
|
||||||
(r1, r2) = compute_trie(chunks, 1)
|
(r1, r2) = compute_trie(chunks, 1)
|
||||||
@@ -486,7 +401,7 @@ def emit_property_module(f, mod, tbl, emit):
|
|||||||
else:
|
else:
|
||||||
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
|
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
|
||||||
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
||||||
f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
|
f.write(" %s_table.lookup(c)\n" % cat)
|
||||||
f.write(" }\n\n")
|
f.write(" }\n\n")
|
||||||
f.write("}\n\n")
|
f.write("}\n\n")
|
||||||
|
|
||||||
@@ -510,12 +425,12 @@ def emit_conversions_module(f, to_upper, to_lower, to_title):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
|
fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
|
||||||
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
|
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
|
||||||
}
|
}
|
||||||
|
|
||||||
""")
|
""")
|
||||||
t_type = "&'static [(char, [char; 3])]"
|
t_type = "&[(char, [char; 3])]"
|
||||||
pfun = lambda x: "(%s,[%s,%s,%s])" % (
|
pfun = lambda x: "(%s,[%s,%s,%s])" % (
|
||||||
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
|
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
|
||||||
emit_table(f, "to_lowercase_table",
|
emit_table(f, "to_lowercase_table",
|
||||||
@@ -557,24 +472,6 @@ if __name__ == "__main__":
|
|||||||
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
|
pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
|
||||||
unicode_version = re.search(pattern, readme.read()).groups()
|
unicode_version = re.search(pattern, readme.read()).groups()
|
||||||
rf.write("""
|
rf.write("""
|
||||||
/// Represents a Unicode Version.
|
|
||||||
///
|
|
||||||
/// See also: <http://www.unicode.org/versions/>
|
|
||||||
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
|
|
||||||
pub struct UnicodeVersion {
|
|
||||||
/// Major version.
|
|
||||||
pub major: u32,
|
|
||||||
|
|
||||||
/// Minor version.
|
|
||||||
pub minor: u32,
|
|
||||||
|
|
||||||
/// Micro (or Update) version.
|
|
||||||
pub micro: u32,
|
|
||||||
|
|
||||||
// Private field to keep struct expandable.
|
|
||||||
_priv: (),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
|
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
|
||||||
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
|
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
|
||||||
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
|
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
|
||||||
@@ -596,10 +493,6 @@ pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
|
|||||||
norm_props = load_properties("DerivedNormalizationProps.txt",
|
norm_props = load_properties("DerivedNormalizationProps.txt",
|
||||||
["Full_Composition_Exclusion"])
|
["Full_Composition_Exclusion"])
|
||||||
|
|
||||||
# trie_lookup_table is used in all the property modules below
|
|
||||||
emit_trie_lookup_range_table(rf)
|
|
||||||
# emit_bsearch_range_table(rf)
|
|
||||||
|
|
||||||
# category tables
|
# category tables
|
||||||
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
|
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
|
||||||
("derived_property", derived, want_derived), \
|
("derived_property", derived, want_derived), \
|
||||||
|
|||||||
27
src/libstd_unicode/version.rs
Normal file
27
src/libstd_unicode/version.rs
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||||
|
// file at the top-level directory of this distribution and at
|
||||||
|
// http://rust-lang.org/COPYRIGHT.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||||
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||||
|
// option. This file may not be copied, modified, or distributed
|
||||||
|
// except according to those terms.
|
||||||
|
|
||||||
|
/// Represents a Unicode Version.
|
||||||
|
///
|
||||||
|
/// See also: <http://www.unicode.org/versions/>
|
||||||
|
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
|
||||||
|
pub struct UnicodeVersion {
|
||||||
|
/// Major version.
|
||||||
|
pub major: u32,
|
||||||
|
|
||||||
|
/// Minor version.
|
||||||
|
pub minor: u32,
|
||||||
|
|
||||||
|
/// Micro (or Update) version.
|
||||||
|
pub micro: u32,
|
||||||
|
|
||||||
|
// Private field to keep struct expandable: code outside this crate cannot
// name it, so it cannot build the struct with a literal. It is `pub(crate)`
// so that other modules of this crate can still construct a value.
|
||||||
|
pub(crate) _priv: (),
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user