Add canonical combining class to std::unicode

This commit is contained in:
Florian Zeitz
2013-08-11 01:57:59 +02:00
parent 83f4bee44f
commit 2675f3e9e7
2 changed files with 219 additions and 4 deletions

View File

@@ -26,11 +26,15 @@ def fetch(f):
def load_unicode_data(f):
fetch(f)
gencats = {}
combines = []
canon_decomp = {}
compat_decomp = {}
curr_cat = ""
curr_combine = ""
c_lo = 0
c_hi = 0
com_lo = 0
com_hi = 0
for line in fileinput.input(f):
fields = line.split(";")
if len(fields) != 15:
@@ -69,7 +73,21 @@ def load_unicode_data(f):
c_lo = code
c_hi = code
return (canon_decomp, compat_decomp, gencats)
# Group entries into runs that share one canonical combining class.
# `combine` and `code` are assigned outside this excerpt (presumably parsed
# from the current input line -- confirm against the full function).
# First entry seen: open the initial run at this code point.
if curr_combine == "":
curr_combine = combine
com_lo = code
com_hi = code
# Same class as the open run: extend the run's upper bound.
if curr_combine == combine:
com_hi = code
else:
# Class changed: record the finished run as (lo, hi, class) unless its
# class is "0", then open a new run at the current code point.
if curr_combine != "0":
combines.append((com_lo, com_hi, curr_combine))
curr_combine = combine
com_lo = code
com_hi = code
# NOTE(review): nothing in this excerpt appends the run that is still open
# when the input ends -- confirm that dropping a trailing non-"0" run is
# intended (or cannot happen with the real data file).
return (canon_decomp, compat_decomp, gencats, combines)
def load_derived_core_properties(f):
@@ -193,7 +211,7 @@ def format_table_content(f, content, indent):
line = " "*indent + chunk
f.write(line)
def emit_decomp_module(f, canon, compat):
def emit_decomp_module(f, canon, compat, combine):
canon_keys = canon.keys()
canon_keys.sort()
@@ -217,8 +235,26 @@ def emit_decomp_module(f, canon, compat):
}
None => None
}
}\n
""")
# Emit a Rust helper that binary-searches a table of (lo, hi, value) ranges
# for the range containing `c` and returns that range's value, or 0 when no
# range matches.
f.write("""
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
use cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
None => 0
}
}\n\n
""")
f.write(" // Canonical decompositions\n")
f.write(" static canonical_table : &'static [(char, &'static [char])] = &[\n")
data = ""
@@ -237,6 +273,7 @@ def emit_decomp_module(f, canon, compat):
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
f.write(" // Compatibility decompositions\n")
f.write(" static compatibility_table : &'static [(char, &'static [char])] = &[\n")
data = ""
@@ -255,10 +292,22 @@ def emit_decomp_module(f, canon, compat):
data += "])"
format_table_content(f, data, 8)
f.write("\n ];\n\n")
# Emit the Rust static `combining_class_table`: one "(lo, hi, class)" tuple
# per element of `combine`, in the order given.
f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n")
ix = 0
for pair in combine:
# ch_prefix is defined outside this excerpt; presumably it returns the
# separator/line-break text for the ix-th entry -- confirm.
f.write(ch_prefix(ix))
# pair is (lo, hi, class): the bounds go through escape_char, the class
# value is written as-is.
f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
ix += 1
f.write("\n ];\n")
f.write(" pub fn canonical(c: char, i: &fn(char)) "
+ "{ d(c, i, false); }\n\n")
f.write(" pub fn compatibility(c: char, i: &fn(char)) "
+"{ d(c, i, true); }\n\n")
# Emit the public Rust accessor `canonical_combining_class`, which looks the
# char up in combining_class_table via bsearch_range_value_table.
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n\n")
f.write(" fn d(c: char, i: &fn(char), k: bool) {\n")
f.write(" use iterator::Iterator;\n");
@@ -302,7 +351,7 @@ for i in [r]:
os.remove(i);
rf = open(r, "w")
(canon_decomp, compat_decomp, gencats) = load_unicode_data("UnicodeData.txt")
(canon_decomp, compat_decomp, gencats, combines) = load_unicode_data("UnicodeData.txt")
# Preamble
rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
@@ -324,7 +373,7 @@ rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGH
emit_property_module(rf, "general_category", gencats)
emit_decomp_module(rf, canon_decomp, compat_decomp)
emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
derived = load_derived_core_properties("DerivedCoreProperties.txt")
emit_property_module(rf, "derived_property", derived)