Add complex (but unconditional) Unicode case mapping. Fix #25800
As a result, the iterator returned by `char::to_uppercase` sometimes yields two or three `char`s instead of just one.
This commit is contained in:
@@ -104,11 +104,11 @@ def load_unicode_data(f):
|
|||||||
# generate char to char direct common and simple conversions
|
# generate char to char direct common and simple conversions
|
||||||
# uppercase to lowercase
|
# uppercase to lowercase
|
||||||
if lowcase != "" and code_org != lowcase:
|
if lowcase != "" and code_org != lowcase:
|
||||||
to_lower[code] = int(lowcase, 16)
|
to_lower[code] = (int(lowcase, 16), 0, 0)
|
||||||
|
|
||||||
# lowercase to uppercase
|
# lowercase to uppercase
|
||||||
if upcase != "" and code_org != upcase:
|
if upcase != "" and code_org != upcase:
|
||||||
to_upper[code] = int(upcase, 16)
|
to_upper[code] = (int(upcase, 16), 0, 0)
|
||||||
|
|
||||||
# store decomposition, if given
|
# store decomposition, if given
|
||||||
if decomp != "":
|
if decomp != "":
|
||||||
@@ -146,6 +146,31 @@ def load_unicode_data(f):
|
|||||||
|
|
||||||
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower)
|
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower)
|
||||||
|
|
||||||
|
def load_special_casing(f, to_upper, to_lower):
|
||||||
|
fetch(f)
|
||||||
|
for line in fileinput.input(f):
|
||||||
|
data = line.split('#')[0].split(';')
|
||||||
|
if len(data) == 5:
|
||||||
|
code, lower, title, upper, _comment = data
|
||||||
|
elif len(data) == 6:
|
||||||
|
code, lower, title, upper, condition, _comment = data
|
||||||
|
if condition.strip():  # Only keep unconditional mappings
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
code = code.strip()
|
||||||
|
lower = lower.strip()
|
||||||
|
title = title.strip()
|
||||||
|
upper = upper.strip()
|
||||||
|
key = int(code, 16)
|
||||||
|
for (map_, values) in [(to_lower, lower), (to_upper, upper)]:
|
||||||
|
if values != code:
|
||||||
|
values = [int(i, 16) for i in values.split()]
|
||||||
|
for _ in range(len(values), 3):
|
||||||
|
values.append(0)
|
||||||
|
assert len(values) == 3
|
||||||
|
map_[key] = values
|
||||||
|
|
||||||
def group_cats(cats):
|
def group_cats(cats):
|
||||||
cats_out = {}
|
cats_out = {}
|
||||||
for cat in cats:
|
for cat in cats:
|
||||||
@@ -279,7 +304,7 @@ def load_east_asian_width(want_widths, except_cats):
|
|||||||
return widths
|
return widths
|
||||||
|
|
||||||
def escape_char(c):
|
def escape_char(c):
|
||||||
return "'\\u{%x}'" % c
|
return "'\\u{%x}'" % c if c != 0 else "'\\0'"
|
||||||
|
|
||||||
def emit_bsearch_range_table(f):
|
def emit_bsearch_range_table(f):
|
||||||
f.write("""
|
f.write("""
|
||||||
@@ -328,21 +353,21 @@ def emit_conversions_module(f, to_upper, to_lower):
|
|||||||
use core::option::Option::{Some, None};
|
use core::option::Option::{Some, None};
|
||||||
use core::result::Result::{Ok, Err};
|
use core::result::Result::{Ok, Err};
|
||||||
|
|
||||||
pub fn to_lower(c: char) -> char {
|
pub fn to_lower(c: char) -> [char; 3] {
|
||||||
match bsearch_case_table(c, to_lowercase_table) {
|
match bsearch_case_table(c, to_lowercase_table) {
|
||||||
None => c,
|
None => [c, '\\0', '\\0'],
|
||||||
Some(index) => to_lowercase_table[index].1
|
Some(index) => to_lowercase_table[index].1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn to_upper(c: char) -> char {
|
pub fn to_upper(c: char) -> [char; 3] {
|
||||||
match bsearch_case_table(c, to_uppercase_table) {
|
match bsearch_case_table(c, to_uppercase_table) {
|
||||||
None => c,
|
None => [c, '\\0', '\\0'],
|
||||||
Some(index) => to_uppercase_table[index].1
|
Some(index) => to_uppercase_table[index].1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
|
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
|
||||||
match table.binary_search_by(|&(key, _)| {
|
match table.binary_search_by(|&(key, _)| {
|
||||||
if c == key { Equal }
|
if c == key { Equal }
|
||||||
else if key < c { Less }
|
else if key < c { Less }
|
||||||
@@ -355,9 +380,17 @@ def emit_conversions_module(f, to_upper, to_lower):
|
|||||||
|
|
||||||
""")
|
""")
|
||||||
emit_table(f, "to_lowercase_table",
|
emit_table(f, "to_lowercase_table",
|
||||||
sorted(to_lower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
|
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
|
||||||
|
is_pub=False,
|
||||||
|
t_type = "&'static [(char, [char; 3])]",
|
||||||
|
pfun=lambda x: "(%s,[%s,%s,%s])" % (
|
||||||
|
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
|
||||||
emit_table(f, "to_uppercase_table",
|
emit_table(f, "to_uppercase_table",
|
||||||
sorted(to_upper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
|
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
|
||||||
|
is_pub=False,
|
||||||
|
t_type = "&'static [(char, [char; 3])]",
|
||||||
|
pfun=lambda x: "(%s,[%s,%s,%s])" % (
|
||||||
|
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2])))
|
||||||
f.write("}\n\n")
|
f.write("}\n\n")
|
||||||
|
|
||||||
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
|
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
|
||||||
@@ -592,6 +625,7 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
|||||||
""" % unicode_version)
|
""" % unicode_version)
|
||||||
(canon_decomp, compat_decomp, gencats, combines,
|
(canon_decomp, compat_decomp, gencats, combines,
|
||||||
to_upper, to_lower) = load_unicode_data("UnicodeData.txt")
|
to_upper, to_lower) = load_unicode_data("UnicodeData.txt")
|
||||||
|
load_special_casing("SpecialCasing.txt", to_upper, to_lower)
|
||||||
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
|
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
|
||||||
derived = load_properties("DerivedCoreProperties.txt", want_derived)
|
derived = load_properties("DerivedCoreProperties.txt", want_derived)
|
||||||
scripts = load_properties("Scripts.txt", [])
|
scripts = load_properties("Scripts.txt", [])
|
||||||
|
|||||||
32
src/libcollectionstest/char.rs
Normal file
32
src/libcollectionstest/char.rs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
|
||||||
|
// file at the top-level directory of this distribution and at
|
||||||
|
// http://rust-lang.org/COPYRIGHT.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||||
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||||
|
// option. This file may not be copied, modified, or distributed
|
||||||
|
// except according to those terms.
|
||||||
|
|
||||||
|
use collections::vec::Vec;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn char_to_lowercase() {
|
||||||
|
assert_iter_eq('A'.to_lowercase(), &['a']);
|
||||||
|
assert_iter_eq('É'.to_lowercase(), &['é']);
|
||||||
|
assert_iter_eq('Dž'.to_lowercase(), &['dž']);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn char_to_uppercase() {
|
||||||
|
assert_iter_eq('a'.to_uppercase(), &['A']);
|
||||||
|
assert_iter_eq('é'.to_uppercase(), &['É']);
|
||||||
|
assert_iter_eq('Dž'.to_uppercase(), &['DŽ']);
|
||||||
|
assert_iter_eq('ß'.to_uppercase(), &['S', 'S']);
|
||||||
|
assert_iter_eq('fi'.to_uppercase(), &['F', 'I']);
|
||||||
|
assert_iter_eq('ᾀ'.to_uppercase(), &['Ἀ', 'Ι']);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_iter_eq<I: Iterator<Item=char>>(iter: I, expected: &[char]) {
|
||||||
|
assert_eq!(iter.collect::<Vec<_>>(), expected);
|
||||||
|
}
|
||||||
@@ -37,6 +37,7 @@ extern crate rustc_unicode;
|
|||||||
mod binary_heap;
|
mod binary_heap;
|
||||||
mod bit;
|
mod bit;
|
||||||
mod btree;
|
mod btree;
|
||||||
|
mod char; // char isn't really a collection, but didn't find a better place for this.
|
||||||
mod enum_set;
|
mod enum_set;
|
||||||
mod fmt;
|
mod fmt;
|
||||||
mod linked_list;
|
mod linked_list;
|
||||||
|
|||||||
@@ -29,7 +29,7 @@
|
|||||||
#![doc(primitive = "char")]
|
#![doc(primitive = "char")]
|
||||||
|
|
||||||
use core::char::CharExt as C;
|
use core::char::CharExt as C;
|
||||||
use core::option::Option::{self, Some};
|
use core::option::Option::{self, Some, None};
|
||||||
use core::iter::Iterator;
|
use core::iter::Iterator;
|
||||||
use tables::{derived_property, property, general_category, conversions, charwidth};
|
use tables::{derived_property, property, general_category, conversions, charwidth};
|
||||||
|
|
||||||
@@ -47,24 +47,67 @@ pub use tables::UNICODE_VERSION;
|
|||||||
/// the [`to_lowercase` method](../primitive.char.html#method.to_lowercase) on
|
/// the [`to_lowercase` method](../primitive.char.html#method.to_lowercase) on
|
||||||
/// characters.
|
/// characters.
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub struct ToLowercase(Option<char>);
|
pub struct ToLowercase(CaseMappingIter);
|
||||||
|
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
impl Iterator for ToLowercase {
|
impl Iterator for ToLowercase {
|
||||||
type Item = char;
|
type Item = char;
|
||||||
fn next(&mut self) -> Option<char> { self.0.take() }
|
fn next(&mut self) -> Option<char> { self.0.next() }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An iterator over the uppercase mapping of a given character, returned from
|
/// An iterator over the uppercase mapping of a given character, returned from
|
||||||
/// the [`to_uppercase` method](../primitive.char.html#method.to_uppercase) on
|
/// the [`to_uppercase` method](../primitive.char.html#method.to_uppercase) on
|
||||||
/// characters.
|
/// characters.
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
pub struct ToUppercase(Option<char>);
|
pub struct ToUppercase(CaseMappingIter);
|
||||||
|
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
impl Iterator for ToUppercase {
|
impl Iterator for ToUppercase {
|
||||||
type Item = char;
|
type Item = char;
|
||||||
fn next(&mut self) -> Option<char> { self.0.take() }
|
fn next(&mut self) -> Option<char> { self.0.next() }
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
enum CaseMappingIter {
|
||||||
|
Three(char, char, char),
|
||||||
|
Two(char, char),
|
||||||
|
One(char),
|
||||||
|
Zero
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CaseMappingIter {
|
||||||
|
fn new(chars: [char; 3]) -> CaseMappingIter {
|
||||||
|
if chars[2] == '\0' {
|
||||||
|
if chars[1] == '\0' {
|
||||||
|
CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0'
|
||||||
|
} else {
|
||||||
|
CaseMappingIter::Two(chars[0], chars[1])
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
CaseMappingIter::Three(chars[0], chars[1], chars[2])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for CaseMappingIter {
|
||||||
|
type Item = char;
|
||||||
|
fn next(&mut self) -> Option<char> {
|
||||||
|
match *self {
|
||||||
|
CaseMappingIter::Three(a, b, c) => {
|
||||||
|
*self = CaseMappingIter::Two(b, c);
|
||||||
|
Some(a)
|
||||||
|
}
|
||||||
|
CaseMappingIter::Two(b, c) => {
|
||||||
|
*self = CaseMappingIter::One(c);
|
||||||
|
Some(b)
|
||||||
|
}
|
||||||
|
CaseMappingIter::One(c) => {
|
||||||
|
*self = CaseMappingIter::Zero;
|
||||||
|
Some(c)
|
||||||
|
}
|
||||||
|
CaseMappingIter::Zero => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
@@ -397,27 +440,27 @@ impl char {
|
|||||||
|
|
||||||
/// Converts a character to its lowercase equivalent.
|
/// Converts a character to its lowercase equivalent.
|
||||||
///
|
///
|
||||||
/// The case-folding performed is the common or simple mapping. See
|
/// This performs complex unconditional mappings with no tailoring.
|
||||||
/// `to_uppercase()` for references and more information.
|
/// See `to_uppercase()` for references and more information.
|
||||||
///
|
///
|
||||||
/// # Return value
|
/// # Return value
|
||||||
///
|
///
|
||||||
/// Returns an iterator which yields the characters corresponding to the
|
/// Returns an iterator which yields the characters corresponding to the
|
||||||
/// lowercase equivalent of the character. If no conversion is possible then
|
/// lowercase equivalent of the character. If no conversion is possible then
|
||||||
/// the input character is returned.
|
/// an iterator with just the input character is returned.
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn to_lowercase(self) -> ToLowercase {
|
pub fn to_lowercase(self) -> ToLowercase {
|
||||||
ToLowercase(Some(conversions::to_lower(self)))
|
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts a character to its uppercase equivalent.
|
/// Converts a character to its uppercase equivalent.
|
||||||
///
|
///
|
||||||
/// The case-folding performed is the common or simple mapping: it maps
|
/// This performs complex unconditional mappings with no tailoring:
|
||||||
/// one Unicode codepoint to its uppercase equivalent according to the
|
/// it maps one Unicode character to its uppercase equivalent
|
||||||
/// Unicode database [1]. The additional [`SpecialCasing.txt`] is not yet
|
/// according to the Unicode database [1]
|
||||||
/// considered here, but the iterator returned will soon support this form
|
/// and the additional complex mappings [`SpecialCasing.txt`].
|
||||||
/// of case folding.
|
/// Conditional mappings (based on context or language) are not considered here.
|
||||||
///
|
///
|
||||||
/// A full reference can be found here [2].
|
/// A full reference can be found here [2].
|
||||||
///
|
///
|
||||||
@@ -425,17 +468,17 @@ impl char {
|
|||||||
///
|
///
|
||||||
/// Returns an iterator which yields the characters corresponding to the
|
/// Returns an iterator which yields the characters corresponding to the
|
||||||
/// uppercase equivalent of the character. If no conversion is possible then
|
/// uppercase equivalent of the character. If no conversion is possible then
|
||||||
/// the input character is returned.
|
/// an iterator with just the input character is returned.
|
||||||
///
|
///
|
||||||
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
/// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||||
///
|
///
|
||||||
/// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
/// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
||||||
///
|
///
|
||||||
/// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
|
/// [2]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
|
||||||
#[stable(feature = "rust1", since = "1.0.0")]
|
#[stable(feature = "rust1", since = "1.0.0")]
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn to_uppercase(self) -> ToUppercase {
|
pub fn to_uppercase(self) -> ToUppercase {
|
||||||
ToUppercase(Some(conversions::to_upper(self)))
|
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns this character's displayed width in columns, or `None` if it is a
|
/// Returns this character's displayed width in columns, or `None` if it is a
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user