Add support to libcore for encoded-in-rust unicode character properties, at least. Add script to compute them from unicode.org.

This commit is contained in:
Graydon Hoare
2011-12-23 18:48:08 -08:00
parent 88d74993d8
commit ac13f0da9e
4 changed files with 4919 additions and 82 deletions

View File

@@ -4,96 +4,75 @@ Module: char
Utilities for manipulating the char type
*/
/*
Lu Uppercase_Letter an uppercase letter
Ll Lowercase_Letter a lowercase letter
Lt Titlecase_Letter a digraphic character, with first part uppercase
Lm Modifier_Letter a modifier letter
Lo Other_Letter other letters, including syllables and ideographs
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
Mc Spacing_Mark a spacing combining mark (positive advance width)
Me Enclosing_Mark an enclosing combining mark
Nd Decimal_Number a decimal digit
Nl Letter_Number a letterlike numeric character
No Other_Number a numeric character of other type
Pc Connector_Punctuation a connecting punctuation mark, like a tie
Pd Dash_Punctuation a dash or hyphen punctuation mark
Ps Open_Punctuation an opening punctuation mark (of a pair)
Pe Close_Punctuation a closing punctuation mark (of a pair)
Pi Initial_Punctuation an initial quotation mark
Pf Final_Punctuation a final quotation mark
Po Other_Punctuation a punctuation mark of other type
Sm Math_Symbol a symbol of primarily mathematical use
Sc Currency_Symbol a currency sign
Sk Modifier_Symbol a non-letterlike modifier symbol
So Other_Symbol a symbol of other type
Zs Space_Separator a space character (of various non-zero widths)
Zl Line_Separator U+2028 LINE SEPARATOR only
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
Cc Control a C0 or C1 control code
Cf Format a format control character
Cs Surrogate a surrogate code point
Co Private_Use a private-use character
Cn Unassigned a reserved unassigned code point or a noncharacter
*/
import is_alphabetic = unicode::derived_property::Alphabetic;
import is_XID_start = unicode::derived_property::XID_Start;
import is_XID_continue = unicode::derived_property::XID_Continue;
/*
Function: is_whitespace
Indicates whether a character is whitespace.
Indicates whether a character is whitespace, defined in terms of
the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
'Cc'-category control codes in the range [0x09, 0x0d].
Whitespace characters include space (U+0020), tab (U+0009), line feed
(U+000A), carriage return (U+000D), and a number of less common
ASCII and unicode characters.
*/
pure fn is_whitespace(c: char) -> bool {
const ch_space: char = '\u0020';
const ch_ogham_space_mark: char = '\u1680';
const ch_mongolian_vowel_sep: char = '\u180e';
const ch_en_quad: char = '\u2000';
const ch_em_quad: char = '\u2001';
const ch_en_space: char = '\u2002';
const ch_em_space: char = '\u2003';
const ch_three_per_em_space: char = '\u2004';
const ch_four_per_em_space: char = '\u2005';
const ch_six_per_em_space: char = '\u2006';
const ch_figure_space: char = '\u2007';
const ch_punctuation_space: char = '\u2008';
const ch_thin_space: char = '\u2009';
const ch_hair_space: char = '\u200a';
const ch_narrow_no_break_space: char = '\u202f';
const ch_medium_mathematical_space: char = '\u205f';
const ch_ideographic_space: char = '\u3000';
const ch_line_separator: char = '\u2028';
const ch_paragraph_separator: char = '\u2029';
const ch_character_tabulation: char = '\u0009';
const ch_line_feed: char = '\u000a';
const ch_line_tabulation: char = '\u000b';
const ch_form_feed: char = '\u000c';
const ch_carriage_return: char = '\u000d';
const ch_next_line: char = '\u0085';
const ch_no_break_space: char = '\u00a0';
if c == ch_space {
true
} else if c == ch_ogham_space_mark {
true
} else if c == ch_mongolian_vowel_sep {
true
} else if c == ch_en_quad {
true
} else if c == ch_em_quad {
true
} else if c == ch_en_space {
true
} else if c == ch_em_space {
true
} else if c == ch_three_per_em_space {
true
} else if c == ch_four_per_em_space {
true
} else if c == ch_six_per_em_space {
true
} else if c == ch_figure_space {
true
} else if c == ch_punctuation_space {
true
} else if c == ch_thin_space {
true
} else if c == ch_hair_space {
true
} else if c == ch_narrow_no_break_space {
true
} else if c == ch_medium_mathematical_space {
true
} else if c == ch_ideographic_space {
true
} else if c == ch_line_tabulation {
true
} else if c == ch_paragraph_separator {
true
} else if c == ch_character_tabulation {
true
} else if c == ch_line_feed {
true
} else if c == ch_line_tabulation {
true
} else if c == ch_form_feed {
true
} else if c == ch_carriage_return {
true
} else if c == ch_next_line {
true
} else if c == ch_no_break_space { true } else { false }
ret ('\x09' <= c && c <= '\x0x0d')
|| unicode::general_category::Zs(c)
|| unicode::general_category::Zl(c)
|| unicode::general_category::Zp(c);
}
/*
Function: is_alphanumeric

Indicates whether a character is alphanumeric: it either has the Unicode
Derived Core Property 'Alphabetic' or belongs to one of the numeric
General Categories 'Nd', 'Nl' or 'No'.
*/
pure fn is_alphanumeric(c: char) -> bool {
    // Alphabetic characters are accepted straight away; the numeric
    // categories are only consulted for everything else.
    if unicode::derived_property::Alphabetic(c) { ret true; }
    ret unicode::general_category::Nd(c)
        || unicode::general_category::Nl(c)
        || unicode::general_category::No(c);
}
/*
Function: to_digit

View File

@@ -30,6 +30,9 @@ mod u64;
mod vec;
mod bool;
// For internal use by char, not exported
mod unicode;
// Ubiquitous-utility-type modules

4683
src/libcore/unicode.rs Normal file

File diff suppressed because it is too large Load Diff