Add Unicode decomposition mappings to std::unicode
This commit is contained in:
@@ -178,50 +178,118 @@ def emit_property_module_old(f, mod, tbl):
|
|||||||
f.write(" }\n\n")
|
f.write(" }\n\n")
|
||||||
f.write("}\n")
|
f.write("}\n")
|
||||||
|
|
||||||
|
def format_table_content(f, content, indent):
    """Write comma-separated table entries to `f`, wrapped into indented lines.

    `content` is split on "," and the pieces are re-joined with ", ".  A new
    line is started whenever appending the next piece would bring the current
    line (before the separator) to 98 characters or more, which keeps every
    emitted line, including the ", " separator and the trailing ",", within
    100 columns as long as a single piece fits on a line.

    f       -- object with a `write(str)` method receiving the output
    content -- string of table entries separated by ","
    indent  -- number of spaces placed at the start of every emitted line

    The final line is written without a trailing comma or newline so the
    caller can close the table itself.
    """
    prefix = " " * indent
    line = prefix
    # True once `line` holds at least one chunk.  The first chunk of the
    # table is always placed directly after the indentation, even when it is
    # too long to fit: flushing an empty line here would emit a stray ","
    # and the following chunk would be glued on without a separator.
    has_items = False
    for chunk in content.split(","):
        if not has_items:
            line += chunk
            has_items = True
        elif len(line) + len(chunk) < 98:
            line += ", " + chunk
        else:
            # Current line is full: terminate it and start the next one.
            f.write(line + ",\n")
            line = prefix + chunk
    f.write(line)
|
||||||
|
|
||||||
def emit_decomp_module(f, canon, compat):
|
def emit_decomp_module(f, canon, compat):
|
||||||
canon_keys = canon.keys()
|
canon_keys = canon.keys()
|
||||||
canon_keys.sort()
|
canon_keys.sort()
|
||||||
|
|
||||||
compat_keys = compat.keys()
|
compat_keys = compat.keys()
|
||||||
compat_keys.sort()
|
compat_keys.sort()
|
||||||
f.write("mod decompose {\n\n");
|
f.write("pub mod decompose {\n");
|
||||||
f.write(" export canonical, compatibility;\n\n")
|
f.write(" use option::Option;\n");
|
||||||
f.write(" fn canonical(c: char, i: block(char)) "
|
f.write(" use option::{Some, None};\n");
|
||||||
+ "{ d(c, i, false); }\n\n")
|
f.write(" use vec::ImmutableVector;\n");
|
||||||
f.write(" fn compatibility(c: char, i: block(char)) "
|
f.write("""
|
||||||
|
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
|
||||||
|
use cmp::{Equal, Less, Greater};
|
||||||
|
match r.bsearch(|&(val, _)| {
|
||||||
|
if c == val { Equal }
|
||||||
|
else if val < c { Less }
|
||||||
|
else { Greater }
|
||||||
|
}) {
|
||||||
|
Some(idx) => {
|
||||||
|
let (_, result) = r[idx];
|
||||||
|
Some(result)
|
||||||
|
}
|
||||||
|
None => None
|
||||||
|
}
|
||||||
|
}\n\n
|
||||||
|
""")
|
||||||
|
f.write(" // Canonical decompositions\n")
|
||||||
|
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
|
||||||
|
data = ""
|
||||||
|
first = True
|
||||||
|
for char in canon_keys:
|
||||||
|
if not first:
|
||||||
|
data += ","
|
||||||
|
first = False
|
||||||
|
data += "(%s,&[" % escape_char(char)
|
||||||
|
first2 = True
|
||||||
|
for d in canon[char]:
|
||||||
|
if not first2:
|
||||||
|
data += ","
|
||||||
|
first2 = False
|
||||||
|
data += escape_char(d)
|
||||||
|
data += "])"
|
||||||
|
format_table_content(f, data, 8)
|
||||||
|
f.write("\n ];\n\n")
|
||||||
|
f.write(" // Compatibility decompositions\n")
|
||||||
|
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
|
||||||
|
data = ""
|
||||||
|
first = True
|
||||||
|
for char in compat_keys:
|
||||||
|
if not first:
|
||||||
|
data += ","
|
||||||
|
first = False
|
||||||
|
data += "(%s,&[" % escape_char(char)
|
||||||
|
first2 = True
|
||||||
|
for d in compat[char]:
|
||||||
|
if not first2:
|
||||||
|
data += ","
|
||||||
|
first2 = False
|
||||||
|
data += escape_char(d)
|
||||||
|
data += "])"
|
||||||
|
format_table_content(f, data, 8)
|
||||||
|
f.write("\n ];\n\n")
|
||||||
|
f.write(" pub fn canonical(c: char, i: &fn(char)) "
|
||||||
|
+ "{ d(c, i, false); }\n\n")
|
||||||
|
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
|
||||||
+"{ d(c, i, true); }\n\n")
|
+"{ d(c, i, true); }\n\n")
|
||||||
f.write(" fn d(c: char, i: block(char), k: bool) {\n")
|
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
|
||||||
|
f.write(" use iterator::Iterator;\n");
|
||||||
|
|
||||||
f.write(" if c <= '\\x7f' { i(c); ret; }\n")
|
f.write(" if c <= '\\x7f' { i(c); return; }\n")
|
||||||
|
|
||||||
# First check the canonical decompositions
|
# First check the canonical decompositions
|
||||||
f.write(" // Canonical decomposition\n")
|
f.write("""
|
||||||
f.write(" alt c {\n")
|
match bsearch_table(c, canonical_table) {
|
||||||
for char in canon_keys:
|
Some(canon) => {
|
||||||
f.write(" %s {\n" % escape_char(char))
|
for x in canon.iter() {
|
||||||
for d in canon[char]:
|
d(*x, |b| i(b), k);
|
||||||
f.write(" d(%s, i, k);\n"
|
}
|
||||||
% escape_char(d))
|
return;
|
||||||
f.write(" }\n")
|
}
|
||||||
|
None => ()
|
||||||
f.write(" _ { }\n")
|
}\n\n""")
|
||||||
f.write(" }\n\n")
|
|
||||||
|
|
||||||
# Bottom out if we're not doing compat.
|
# Bottom out if we're not doing compat.
|
||||||
f.write(" if !k { i(c); ret; }\n\n ")
|
f.write(" if !k { i(c); return; }\n")
|
||||||
|
|
||||||
# Then check the compatibility decompositions
|
# Then check the compatibility decompositions
|
||||||
f.write(" // Compatibility decomposition\n")
|
f.write("""
|
||||||
f.write(" alt c {\n")
|
match bsearch_table(c, compatibility_table) {
|
||||||
for char in compat_keys:
|
Some(compat) => {
|
||||||
f.write(" %s {\n" % escape_char(char))
|
for x in compat.iter() {
|
||||||
for d in compat[char]:
|
d(*x, |b| i(b), k);
|
||||||
f.write(" d(%s, i, k);\n"
|
}
|
||||||
% escape_char(d))
|
return;
|
||||||
f.write(" }\n")
|
}
|
||||||
|
None => ()
|
||||||
f.write(" _ { }\n")
|
}\n\n""")
|
||||||
f.write(" }\n\n")
|
|
||||||
|
|
||||||
# Finally bottom out.
|
# Finally bottom out.
|
||||||
f.write(" i(c);\n")
|
f.write(" i(c);\n")
|
||||||
@@ -256,7 +324,7 @@ rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGH
|
|||||||
|
|
||||||
emit_property_module(rf, "general_category", gencats)
|
emit_property_module(rf, "general_category", gencats)
|
||||||
|
|
||||||
#emit_decomp_module(rf, canon_decomp, compat_decomp)
|
emit_decomp_module(rf, canon_decomp, compat_decomp)
|
||||||
|
|
||||||
derived = load_derived_core_properties("DerivedCoreProperties.txt")
|
derived = load_derived_core_properties("DerivedCoreProperties.txt")
|
||||||
emit_property_module(rf, "derived_property", derived)
|
emit_property_module(rf, "derived_property", derived)
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
use option::{None, Option, Some};
|
use option::{None, Option, Some};
|
||||||
use int;
|
use int;
|
||||||
use str::StrSlice;
|
use str::StrSlice;
|
||||||
use unicode::{derived_property, general_category};
|
use unicode::{derived_property, general_category, decompose};
|
||||||
|
|
||||||
#[cfg(test)] use str::OwnedStr;
|
#[cfg(test)] use str::OwnedStr;
|
||||||
|
|
||||||
@@ -202,6 +202,51 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
|
||||||
|
static S_BASE: uint = 0xAC00;
|
||||||
|
static L_BASE: uint = 0x1100;
|
||||||
|
static V_BASE: uint = 0x1161;
|
||||||
|
static T_BASE: uint = 0x11A7;
|
||||||
|
static L_COUNT: uint = 19;
|
||||||
|
static V_COUNT: uint = 21;
|
||||||
|
static T_COUNT: uint = 28;
|
||||||
|
static N_COUNT: uint = (V_COUNT * T_COUNT);
|
||||||
|
static S_COUNT: uint = (L_COUNT * N_COUNT);
|
||||||
|
|
||||||
|
// Decompose a precomposed Hangul syllable
|
||||||
|
fn decompose_hangul(s: char, f: &fn(char)) {
|
||||||
|
let si = s as uint - S_BASE;
|
||||||
|
|
||||||
|
let li = si / N_COUNT;
|
||||||
|
f((L_BASE + li) as char);
|
||||||
|
|
||||||
|
let vi = (si % N_COUNT) / T_COUNT;
|
||||||
|
f((V_BASE + vi) as char);
|
||||||
|
|
||||||
|
let ti = si % T_COUNT;
|
||||||
|
if ti > 0 {
|
||||||
|
f((T_BASE + ti) as char);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the canonical decompostion of a character
|
||||||
|
pub fn decompose_canonical(c: char, f: &fn(char)) {
|
||||||
|
if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
|
||||||
|
decompose::canonical(c, f);
|
||||||
|
} else {
|
||||||
|
decompose_hangul(c, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the compatibility decompostion of a character
|
||||||
|
pub fn decompose_compatible(c: char, f: &fn(char)) {
|
||||||
|
if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
|
||||||
|
decompose::compatibility(c, f);
|
||||||
|
} else {
|
||||||
|
decompose_hangul(c, f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Return the hexadecimal unicode escape of a char.
|
/// Return the hexadecimal unicode escape of a char.
|
||||||
///
|
///
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user