Add complex (but unconditional) Unicode case mapping. Fix #25800

As a result, the iterator returned by `char::to_uppercase` sometimes
yields two or three `char`s instead of just one.
This commit is contained in:
Simon Sapin
2015-06-05 17:40:09 +02:00
parent 66af12721a
commit addaa5b1ff
5 changed files with 1154 additions and 670 deletions

View File

@@ -104,11 +104,11 @@ def load_unicode_data(f):
# generate char to char direct common and simple conversions
# uppercase to lowercase
if lowcase != "" and code_org != lowcase:
to_lower[code] = int(lowcase, 16)
to_lower[code] = (int(lowcase, 16), 0, 0)
# lowercase to uppercase
if upcase != "" and code_org != upcase:
to_upper[code] = int(upcase, 16)
to_upper[code] = (int(upcase, 16), 0, 0)
# store decomposition, if given
if decomp != "":
@@ -146,6 +146,31 @@ def load_unicode_data(f):
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower)
def load_special_casing(f, to_upper, to_lower):
    """Merge the unconditional mappings listed in `f` into the case tables.

    Each data line is expected to look like
    ``code; lower; title; upper; [condition;] # comment``, with every field
    holding space-separated hexadecimal code points.

    f        -- name of the data file; handed to `fetch` and then read line
                by line with `fileinput`.
    to_upper -- dict updated in place: code point -> 3-tuple of code points.
    to_lower -- dict updated in place: code point -> 3-tuple of code points.

    Mappings shorter than three code points are padded with 0, matching the
    `(value, 0, 0)` entries that `load_unicode_data` stores. Entries whose
    mapping equals the code point itself are skipped, as are lines with a
    non-empty condition field and lines that do not have 5 or 6 fields.
    Existing entries for the same code point are overwritten.

    Raises AssertionError if a mapping has more than three code points.
    """
    # NOTE(review): `fetch` is defined elsewhere in this file; presumably it
    # makes sure the data file is available locally -- confirm.
    fetch(f)
    for line in fileinput.input(f):
        # Drop the trailing `# comment`, then split the record into fields.
        fields = [field.strip() for field in line.split('#')[0].split(';')]
        if len(fields) == 5:
            code, lower, _title, upper, _trailing = fields
        elif len(fields) == 6:
            code, lower, _title, upper, condition, _trailing = fields
            if condition:  # Only keep unconditional mappings
                continue
        else:
            # Blank lines, comment-only lines and anything else unexpected.
            continue

        key = int(code, 16)
        for (map_, values) in [(to_lower, lower), (to_upper, upper)]:
            if values != code:
                points = [int(i, 16) for i in values.split()]
                assert len(points) <= 3, \
                    "case mapping for U+%s has more than 3 code points" % code
                # Pad to a fixed width of three and store as a tuple so the
                # entries have the same shape as those from load_unicode_data.
                map_[key] = tuple(points + [0] * (3 - len(points)))
def group_cats(cats):
cats_out = {}
for cat in cats:
@@ -279,7 +304,7 @@ def load_east_asian_width(want_widths, except_cats):
return widths
def escape_char(c):
# Format the integer code point `c` as the source text of a Rust `char`
# literal, using the `'\u{...}'` hexadecimal escape form.
# NOTE(review): this is a diff view with the +/- markers stripped; the first
# `return` below appears to be the pre-change line and the second its
# replacement -- confirm against the real file.
return "'\\u{%x}'" % c
# Replacement form: code point 0 is written as `'\0'` instead of `'\u{0}'`;
# every other value is formatted exactly as before.
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
def emit_bsearch_range_table(f):
f.write("""
@@ -328,21 +353,21 @@ def emit_conversions_module(f, to_upper, to_lower):
use core::option::Option::{Some, None};
use core::result::Result::{Ok, Err};
pub fn to_lower(c: char) -> char {
pub fn to_lower(c: char) -> [char; 3] {
match bsearch_case_table(c, to_lowercase_table) {
None => c,
None => [c, '\\0', '\\0'],
Some(index) => to_lowercase_table[index].1
}
}
pub fn to_upper(c: char) -> char {
pub fn to_upper(c: char) -> [char; 3] {
match bsearch_case_table(c, to_uppercase_table) {
None => c,
None => [c, '\\0', '\\0'],
Some(index) => to_uppercase_table[index].1
}
}
fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
match table.binary_search_by(|&(key, _)| {
if c == key { Equal }
else if key < c { Less }
@@ -355,9 +380,17 @@ def emit_conversions_module(f, to_upper, to_lower):
""")
emit_table(f, "to_lowercase_table",
sorted(to_lower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
is_pub=False,
t_type = "&'static [(char, [char; 3])]",
pfun=lambda x: "(%s,[%s,%s,%s])" % (
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
emit_table(f, "to_uppercase_table",
sorted(to_upper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
is_pub=False,
t_type = "&'static [(char, [char; 3])]",
pfun=lambda x: "(%s,[%s,%s,%s])" % (
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
f.write("}\n\n")
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
@@ -592,6 +625,7 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
(canon_decomp, compat_decomp, gencats, combines,
to_upper, to_lower) = load_unicode_data("UnicodeData.txt")
load_special_casing("SpecialCasing.txt", to_upper, to_lower)
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
derived = load_properties("DerivedCoreProperties.txt", want_derived)
scripts = load_properties("Scripts.txt", [])