Add Unicode decomposition mappings to std::unicode

This commit is contained in:
Florian Zeitz
2013-08-07 20:48:10 +02:00
parent d4d856b129
commit 83f4bee44f
3 changed files with 2196 additions and 32 deletions

View File

@@ -178,50 +178,118 @@ def emit_property_module_old(f, mod, tbl):
f.write(" }\n\n")
f.write("}\n")
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def emit_decomp_module(f, canon, compat):
canon_keys = canon.keys()
canon_keys.sort()
compat_keys = compat.keys()
compat_keys.sort()
f.write("mod decompose {\n\n");
f.write(" export canonical, compatibility;\n\n")
f.write(" fn canonical(c: char, i: block(char)) "
+ "{ d(c, i, false); }\n\n")
f.write(" fn compatibility(c: char, i: block(char)) "
f.write("pub mod decompose {\n");
f.write(" use option::Option;\n");
f.write(" use option::{Some, None};\n");
f.write(" use vec::ImmutableVector;\n");
f.write("""
fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
use cmp::{Equal, Less, Greater};
match r.bsearch(|&(val, _)| {
if c == val { Equal }
else if val < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, result) = r[idx];
Some(result)
}
None => None
}
}\n\n
""")
f.write(" // Canonical decompositions\n")
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in canon_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in canon[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
f.write(" // Compatibility decompositions\n")
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
data = ""
first = True
for char in compat_keys:
if not first:
data += ","
first = False
data += "(%s,&[" % escape_char(char)
first2 = True
for d in compat[char]:
if not first2:
data += ","
first2 = False
data += escape_char(d)
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
f.write(" pub fn canonical(c: char, i: &fn(char)) "
+ "{ d(c, i, false); }\n\n")
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
+"{ d(c, i, true); }\n\n")
f.write(" fn d(c: char, i: block(char), k: bool) {\n")
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
f.write(" use iterator::Iterator;\n");
f.write(" if c <= '\\x7f' { i(c); ret; }\n")
f.write(" if c <= '\\x7f' { i(c); return; }\n")
# First check the canonical decompositions
f.write(" // Canonical decomposition\n")
f.write(" alt c {\n")
for char in canon_keys:
f.write(" %s {\n" % escape_char(char))
for d in canon[char]:
f.write(" d(%s, i, k);\n"
% escape_char(d))
f.write(" }\n")
f.write(" _ { }\n")
f.write(" }\n\n")
f.write("""
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}\n\n""")
# Bottom out if we're not doing compat.
f.write(" if !k { i(c); ret; }\n\n ")
f.write(" if !k { i(c); return; }\n")
# Then check the compatibility decompositions
f.write(" // Compatibility decomposition\n")
f.write(" alt c {\n")
for char in compat_keys:
f.write(" %s {\n" % escape_char(char))
for d in compat[char]:
f.write(" d(%s, i, k);\n"
% escape_char(d))
f.write(" }\n")
f.write(" _ { }\n")
f.write(" }\n\n")
f.write("""
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
d(*x, |b| i(b), k);
}
return;
}
None => ()
}\n\n""")
# Finally bottom out.
f.write(" i(c);\n")
@@ -256,7 +324,7 @@ rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGH
emit_property_module(rf, "general_category", gencats)
#emit_decomp_module(rf, canon_decomp, compat_decomp)
emit_decomp_module(rf, canon_decomp, compat_decomp)
derived = load_derived_core_properties("DerivedCoreProperties.txt")
emit_property_module(rf, "derived_property", derived)