Add support to libcore for encoded-in-rust unicode character properties, at least. Add script to compute them from unicode.org.
This commit is contained in:
@@ -4,96 +4,75 @@ Module: char
|
||||
Utilities for manipulating the char type
|
||||
*/
|
||||
|
||||
/*
|
||||
Lu Uppercase_Letter an uppercase letter
|
||||
Ll Lowercase_Letter a lowercase letter
|
||||
Lt Titlecase_Letter a digraphic character, with first part uppercase
|
||||
Lm Modifier_Letter a modifier letter
|
||||
Lo Other_Letter other letters, including syllables and ideographs
|
||||
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
|
||||
Mc Spacing_Mark a spacing combining mark (positive advance width)
|
||||
Me Enclosing_Mark an enclosing combining mark
|
||||
Nd Decimal_Number a decimal digit
|
||||
Nl Letter_Number a letterlike numeric character
|
||||
No Other_Number a numeric character of other type
|
||||
Pc Connector_Punctuation a connecting punctuation mark, like a tie
|
||||
Pd Dash_Punctuation a dash or hyphen punctuation mark
|
||||
Ps Open_Punctuation an opening punctuation mark (of a pair)
|
||||
Pe Close_Punctuation a closing punctuation mark (of a pair)
|
||||
Pi Initial_Punctuation an initial quotation mark
|
||||
Pf Final_Punctuation a final quotation mark
|
||||
Po Other_Punctuation a punctuation mark of other type
|
||||
Sm Math_Symbol a symbol of primarily mathematical use
|
||||
Sc Currency_Symbol a currency sign
|
||||
Sk Modifier_Symbol a non-letterlike modifier symbol
|
||||
So Other_Symbol a symbol of other type
|
||||
Zs Space_Separator a space character (of various non-zero widths)
|
||||
Zl Line_Separator U+2028 LINE SEPARATOR only
|
||||
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
|
||||
Cc Control a C0 or C1 control code
|
||||
Cf Format a format control character
|
||||
Cs Surrogate a surrogate code point
|
||||
Co Private_Use a private-use character
|
||||
Cn Unassigned a reserved unassigned code point or a noncharacter
|
||||
*/
|
||||
|
||||
import is_alphabetic = unicode::derived_property::Alphabetic;
|
||||
import is_XID_start = unicode::derived_property::XID_Start;
|
||||
import is_XID_continue = unicode::derived_property::XID_Continue;
|
||||
|
||||
/*
|
||||
Function: is_whitespace
|
||||
|
||||
Indicates whether a character is whitespace.
|
||||
Indicates whether a character is whitespace, defined in terms of
|
||||
the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
|
||||
'Cc'-category control codes in the range [0x09, 0x0d].
|
||||
|
||||
Whitespace characters include space (U+0020), tab (U+0009), line feed
|
||||
(U+000A), carriage return (U+000D), and a number of less common
|
||||
ASCII and unicode characters.
|
||||
*/
|
||||
pure fn is_whitespace(c: char) -> bool {
|
||||
const ch_space: char = '\u0020';
|
||||
const ch_ogham_space_mark: char = '\u1680';
|
||||
const ch_mongolian_vowel_sep: char = '\u180e';
|
||||
const ch_en_quad: char = '\u2000';
|
||||
const ch_em_quad: char = '\u2001';
|
||||
const ch_en_space: char = '\u2002';
|
||||
const ch_em_space: char = '\u2003';
|
||||
const ch_three_per_em_space: char = '\u2004';
|
||||
const ch_four_per_em_space: char = '\u2005';
|
||||
const ch_six_per_em_space: char = '\u2006';
|
||||
const ch_figure_space: char = '\u2007';
|
||||
const ch_punctuation_space: char = '\u2008';
|
||||
const ch_thin_space: char = '\u2009';
|
||||
const ch_hair_space: char = '\u200a';
|
||||
const ch_narrow_no_break_space: char = '\u202f';
|
||||
const ch_medium_mathematical_space: char = '\u205f';
|
||||
const ch_ideographic_space: char = '\u3000';
|
||||
const ch_line_separator: char = '\u2028';
|
||||
const ch_paragraph_separator: char = '\u2029';
|
||||
const ch_character_tabulation: char = '\u0009';
|
||||
const ch_line_feed: char = '\u000a';
|
||||
const ch_line_tabulation: char = '\u000b';
|
||||
const ch_form_feed: char = '\u000c';
|
||||
const ch_carriage_return: char = '\u000d';
|
||||
const ch_next_line: char = '\u0085';
|
||||
const ch_no_break_space: char = '\u00a0';
|
||||
|
||||
if c == ch_space {
|
||||
true
|
||||
} else if c == ch_ogham_space_mark {
|
||||
true
|
||||
} else if c == ch_mongolian_vowel_sep {
|
||||
true
|
||||
} else if c == ch_en_quad {
|
||||
true
|
||||
} else if c == ch_em_quad {
|
||||
true
|
||||
} else if c == ch_en_space {
|
||||
true
|
||||
} else if c == ch_em_space {
|
||||
true
|
||||
} else if c == ch_three_per_em_space {
|
||||
true
|
||||
} else if c == ch_four_per_em_space {
|
||||
true
|
||||
} else if c == ch_six_per_em_space {
|
||||
true
|
||||
} else if c == ch_figure_space {
|
||||
true
|
||||
} else if c == ch_punctuation_space {
|
||||
true
|
||||
} else if c == ch_thin_space {
|
||||
true
|
||||
} else if c == ch_hair_space {
|
||||
true
|
||||
} else if c == ch_narrow_no_break_space {
|
||||
true
|
||||
} else if c == ch_medium_mathematical_space {
|
||||
true
|
||||
} else if c == ch_ideographic_space {
|
||||
true
|
||||
} else if c == ch_line_tabulation {
|
||||
true
|
||||
} else if c == ch_paragraph_separator {
|
||||
true
|
||||
} else if c == ch_character_tabulation {
|
||||
true
|
||||
} else if c == ch_line_feed {
|
||||
true
|
||||
} else if c == ch_line_tabulation {
|
||||
true
|
||||
} else if c == ch_form_feed {
|
||||
true
|
||||
} else if c == ch_carriage_return {
|
||||
true
|
||||
} else if c == ch_next_line {
|
||||
true
|
||||
} else if c == ch_no_break_space { true } else { false }
|
||||
ret ('\x09' <= c && c <= '\x0x0d')
|
||||
|| unicode::general_category::Zs(c)
|
||||
|| unicode::general_category::Zl(c)
|
||||
|| unicode::general_category::Zp(c);
|
||||
}
|
||||
|
||||
/*
|
||||
Function: is_alphanumeric
|
||||
|
||||
Indicates whether a character is alphanumeric, defined in terms of
|
||||
the Unicode General Categories 'Nd', 'Nl', 'No' and the Derived
|
||||
Core Property 'Alphabetic'.
|
||||
|
||||
*/
|
||||
|
||||
pure fn is_alphanumeric(c: char) -> bool {
|
||||
ret unicode::derived_property::Alphabetic(c) ||
|
||||
unicode::general_category::Nd(c) ||
|
||||
unicode::general_category::Nl(c) ||
|
||||
unicode::general_category::No(c);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Function: to_digit
|
||||
|
||||
|
||||
@@ -30,6 +30,9 @@ mod u64;
|
||||
mod vec;
|
||||
mod bool;
|
||||
|
||||
// For internal use by char, not exported
|
||||
mod unicode;
|
||||
|
||||
|
||||
// Ubiquitous-utility-type modules
|
||||
|
||||
|
||||
4683
src/libcore/unicode.rs
Normal file
4683
src/libcore/unicode.rs
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user