Add complex (but unconditional) Unicode case mapping. Fix #25800

As a result, the iterator returned by `char::to_uppercase` sometimes yields two or three `char`s instead of just one.
2015-06-05 17:40:09 +02:00
parent 66af12721a
commit addaa5b1ff
5 changed files with 1154 additions and 670 deletions
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -104,11 +104,11 @@ def load_unicode_data(f):
        # generate char to char direct common and simple conversions
        # uppercase to lowercase
        if lowcase != "" and code_org != lowcase:
-            to_lower[code] = int(lowcase, 16)
+            to_lower[code] = (int(lowcase, 16), 0, 0)
        # lowercase to uppercase
        if upcase != "" and code_org != upcase:
-            to_upper[code] = int(upcase, 16)
+            to_upper[code] = (int(upcase, 16), 0, 0)
        # store decomposition, if given
        if decomp != "":
@@ -146,6 +146,31 @@ def load_unicode_data(f):
    return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower)
 def load_special_casing(f, to_upper, to_lower):
    """Merge the unconditional mappings listed in file `f` into the case tables.

    Each data line is expected to look like
    `code; lower; title; upper; [condition;] # comment`.  Lines with any other
    number of `;`-separated fields (blank or comment-only lines, for example)
    are ignored, and so are lines whose condition field is non-empty.

    `to_upper` and `to_lower` are dicts keyed by integer code point; they are
    updated in place.  Every stored value is a 3-tuple of integer code points
    padded with zeros, the same shape `load_unicode_data` uses for its
    `(x, 0, 0)` entries.  A mapping with more than three code points trips the
    assertion below.  The title-case column is read but not used.

    NOTE(review): `fetch` is defined elsewhere in this script; presumably it
    makes sure `f` is available locally before it is read -- confirm.
    """
    fetch(f)
    for line in fileinput.input(f):
        # Drop the trailing `# ...` comment before splitting into fields.
        data = line.split('#')[0].split(';')
        if len(data) == 5:
            code, lower, title, upper, _comment = data
        elif len(data) == 6:
            code, lower, title, upper, condition, _comment = data
            if condition.strip():  # Only keep unconditional mappings
                continue
        else:
            continue
        key = int(code.strip(), 16)
        for (map_, values) in [(to_lower, lower), (to_upper, upper)]:
            mapped = [int(i, 16) for i in values.split()]
            # A character that maps to itself needs no table entry.  Compare
            # numerically so hex formatting differences cannot hide this.
            if mapped == [key]:
                continue
            # Pad to the fixed width of three; 0 marks an unused slot.
            mapped.extend([0] * (3 - len(mapped)))
            assert len(mapped) == 3
            map_[key] = tuple(mapped)
 def group_cats(cats):
    cats_out = {}
    for cat in cats:
@@ -279,7 +304,7 @@ def load_east_asian_width(want_widths, except_cats):
    return widths
 def escape_char(c):
-    return "'\\u{%x}'" % c
+    return "'\\u{%x}'" % c if c != 0 else "'\\0'"
 def emit_bsearch_range_table(f):
    f.write("""
@@ -328,21 +353,21 @@ def emit_conversions_module(f, to_upper, to_lower):
    use core::option::Option::{Some, None};
    use core::result::Result::{Ok, Err};
-    pub fn to_lower(c: char) -> char {
+    pub fn to_lower(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_lowercase_table) {
-          None        => c,
+          None        => [c, '\\0', '\\0'],
          Some(index) => to_lowercase_table[index].1
        }
    }
-    pub fn to_upper(c: char) -> char {
+    pub fn to_upper(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_uppercase_table) {
-            None        => c,
+            None        => [c, '\\0', '\\0'],
            Some(index) => to_uppercase_table[index].1
        }
    }
-    fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
+    fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
        match table.binary_search_by(|&(key, _)| {
            if c == key { Equal }
            else if key < c { Less }
@@ -355,9 +380,17 @@ def emit_conversions_module(f, to_upper, to_lower):
 """)
    emit_table(f, "to_lowercase_table",
-        sorted(to_lower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
+        sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
        is_pub=False,
        t_type = "&'static [(char, [char; 3])]",
        pfun=lambda x: "(%s,[%s,%s,%s])" % (
            escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
    emit_table(f, "to_uppercase_table",
-        sorted(to_upper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
+        sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
        is_pub=False,
        t_type = "&'static [(char, [char; 3])]",
        pfun=lambda x: "(%s,[%s,%s,%s])" % (
            escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
    f.write("}\n\n")
 def emit_grapheme_module(f, grapheme_table, grapheme_cats):
@@ -592,6 +625,7 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
 """ % unicode_version)
        (canon_decomp, compat_decomp, gencats, combines,
                to_upper, to_lower) = load_unicode_data("UnicodeData.txt")
        load_special_casing("SpecialCasing.txt", to_upper, to_lower)
        want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
        derived = load_properties("DerivedCoreProperties.txt", want_derived)
        scripts = load_properties("Scripts.txt", [])
--- a/src/libcollectionstest/char.rs
+++ b/src/libcollectionstest/char.rs
@@ -0,0 +1,32 @@
 // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 use collections::vec::Vec;
 /// Checks `char::to_lowercase` for one-to-one mappings, a one-to-many
 /// mapping, and a character that has no lowercase form.
 #[test]
 fn char_to_lowercase() {
    assert_iter_eq('A'.to_lowercase(), &['a']);
    assert_iter_eq('É'.to_lowercase(), &['é']);
    assert_iter_eq('ǅ'.to_lowercase(), &['ǆ']);
    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE lowercases unconditionally
    // to two characters: 'i' followed by U+0307 COMBINING DOT ABOVE.
    assert_iter_eq('\u{130}'.to_lowercase(), &['i', '\u{307}']);
    // A character without a case mapping is yielded unchanged.
    assert_iter_eq('1'.to_lowercase(), &['1']);
 }
 /// Checks `char::to_uppercase` for one-, two- and three-character results,
 /// and for a character that has no uppercase form.
 #[test]
 fn char_to_uppercase() {
    assert_iter_eq('a'.to_uppercase(), &['A']);
    assert_iter_eq('é'.to_uppercase(), &['É']);
    assert_iter_eq('ǅ'.to_uppercase(), &['Ǆ']);
    assert_iter_eq('ß'.to_uppercase(), &['S', 'S']);
    assert_iter_eq('ﬁ'.to_uppercase(), &['F', 'I']);
    assert_iter_eq('ᾀ'.to_uppercase(), &['Ἀ', 'Ι']);
    // U+FB03 LATIN SMALL LIGATURE FFI expands to three characters.
    assert_iter_eq('\u{fb03}'.to_uppercase(), &['F', 'F', 'I']);
    // A character without a case mapping is yielded unchanged.
    assert_iter_eq('1'.to_uppercase(), &['1']);
 }
 /// Drains `iter` into a vector and asserts that it produced exactly the
 /// characters in `expected`, in the same order.
 fn assert_iter_eq<I>(iter: I, expected: &[char]) where I: Iterator<Item=char> {
    let actual: Vec<char> = iter.collect();
    assert_eq!(actual, expected);
 }
--- a/src/libcollectionstest/lib.rs
+++ b/src/libcollectionstest/lib.rs
@@ -37,6 +37,7 @@ extern crate rustc_unicode;
 mod binary_heap;
 mod bit;
 mod btree;
 mod char;  // char isn't really a collection, but didn't find a better place for this.
 mod enum_set;
 mod fmt;
 mod linked_list;
--- a/src/librustc_unicode/char.rs
+++ b/src/librustc_unicode/char.rs
@@ -29,7 +29,7 @@
 #![doc(primitive = "char")]
 use core::char::CharExt as C;
-use core::option::Option::{self, Some};
+use core::option::Option::{self, Some, None};
 use core::iter::Iterator;
 use tables::{derived_property, property, general_category, conversions, charwidth};
@@ -47,24 +47,67 @@ pub use tables::UNICODE_VERSION;
 /// the [`to_lowercase` method](../primitive.char.html#method.to_lowercase) on
 /// characters.
 #[stable(feature = "rust1", since = "1.0.0")]
-pub struct ToLowercase(Option<char>);
+pub struct ToLowercase(CaseMappingIter);
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Iterator for ToLowercase {
    type Item = char;
-    fn next(&mut self) -> Option<char> { self.0.take() }
+    fn next(&mut self) -> Option<char> { self.0.next() }
 }
 /// An iterator over the uppercase mapping of a given character, returned from
 /// the [`to_uppercase` method](../primitive.char.html#method.to_uppercase) on
 /// characters.
 #[stable(feature = "rust1", since = "1.0.0")]
-pub struct ToUppercase(Option<char>);
+pub struct ToUppercase(CaseMappingIter);
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Iterator for ToUppercase {
    type Item = char;
-    fn next(&mut self) -> Option<char> { self.0.take() }
+    fn next(&mut self) -> Option<char> { self.0.next() }
 }
 /// Iterator over a case mapping of one to three `char`s.
 ///
 /// Each variant holds the characters that have not been yielded yet, in the
 /// order they will be yielded; `Zero` is the exhausted state.
 enum CaseMappingIter {
    Three(char, char, char),
    Two(char, char),
    One(char),
    Zero
 }

 impl CaseMappingIter {
    /// Builds the iterator from a fixed-width mapping in which trailing
    /// `'\0'` entries are padding rather than part of the mapping.
    fn new(chars: [char; 3]) -> CaseMappingIter {
        // Padding sits at the end, so inspect the last slot first.
        if chars[2] != '\0' {
            CaseMappingIter::Three(chars[0], chars[1], chars[2])
        } else if chars[1] != '\0' {
            CaseMappingIter::Two(chars[0], chars[1])
        } else {
            // The first slot is always kept, even when it is `'\0'`, so a
            // mapping of NUL alone is still yielded once.
            CaseMappingIter::One(chars[0])
        }
    }
 }

 impl Iterator for CaseMappingIter {
    type Item = char;

    /// Yields the next pending character and steps down to the next state.
    fn next(&mut self) -> Option<char> {
        match *self {
            CaseMappingIter::Three(a, b, c) => {
                *self = CaseMappingIter::Two(b, c);
                Some(a)
            }
            CaseMappingIter::Two(b, c) => {
                *self = CaseMappingIter::One(c);
                Some(b)
            }
            CaseMappingIter::One(c) => {
                *self = CaseMappingIter::Zero;
                Some(c)
            }
            CaseMappingIter::Zero => None,
        }
    }

    /// The variant encodes exactly how many characters are left, so report
    /// that instead of the default `(0, None)`; this lets consumers such as
    /// `collect` reserve the right capacity up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match *self {
            CaseMappingIter::Three(..) => 3,
            CaseMappingIter::Two(..) => 2,
            CaseMappingIter::One(_) => 1,
            CaseMappingIter::Zero => 0,
        };
        (remaining, Some(remaining))
    }
 }
 #[stable(feature = "rust1", since = "1.0.0")]
@@ -397,27 +440,27 @@ impl char {
    /// Converts a character to its lowercase equivalent.
    ///
-    /// The case-folding performed is the common or simple mapping. See
+    /// This performs complex unconditional mappings with no tailoring.
-    /// `to_uppercase()` for references and more information.
+    /// See `to_uppercase()` for references and more information.
    ///
    /// # Return value
    ///
    /// Returns an iterator which yields the characters corresponding to the
    /// lowercase equivalent of the character. If no conversion is possible then
-    /// the input character is returned.
+    /// an iterator with just the input character is returned.
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn to_lowercase(self) -> ToLowercase {
-        ToLowercase(Some(conversions::to_lower(self)))
+        ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
    }
    /// Converts a character to its uppercase equivalent.
    ///
-    /// The case-folding performed is the common or simple mapping: it maps
+    /// This performs complex unconditional mappings with no tailoring:
-    /// one Unicode codepoint to its uppercase equivalent according to the
+    /// it maps one Unicode character to its uppercase equivalent
-    /// Unicode database [1]. The additional [`SpecialCasing.txt`] is not yet
+    /// according to the Unicode database [1]
-    /// considered here, but the iterator returned will soon support this form
+    /// and the additional complex mappings [`SpecialCasing.txt`].
-    /// of case folding.
+    /// Conditional mappings (based on context or language) are not considered here.
    ///
    /// A full reference can be found here [2].
    ///
@@ -425,17 +468,17 @@ impl char {
    ///
    /// Returns an iterator which yields the characters corresponding to the
    /// uppercase equivalent of the character. If no conversion is possible then
-    /// the input character is returned.
+    /// an iterator with just the input character is returned.
    ///
    /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    ///
    /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
    ///
-    /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
+    /// [2]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn to_uppercase(self) -> ToUppercase {
-        ToUppercase(Some(conversions::to_upper(self)))
+        ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
    }
    /// Returns this character's displayed width in columns, or `None` if it is a
--- a/src/librustc_unicode/tables.rs
+++ b/src/librustc_unicode/tables.rs