Add support to libcore for encoded-in-rust unicode character properties, at least. Add script to compute them from unicode.org.
This commit is contained in:
172
src/etc/unicode.py
Executable file
172
src/etc/unicode.py
Executable file
@@ -0,0 +1,172 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
|
||||||
|
# code covering the core properties. Since this is a pretty rare event we
|
||||||
|
# just store this out-of-line and check the unicode.rs file into git.
|
||||||
|
#
|
||||||
|
# The emitted code is "the minimum we think is necessary for libcore", that
|
||||||
|
# is, to support basic operations of the compiler and "most nontrivial rust
|
||||||
|
# programs". It is not meant to be a complete implementation of unicode.
|
||||||
|
# For that we recommend you use a proper binding to libicu.
|
||||||
|
|
||||||
|
import fileinput, re, os, sys
|
||||||
|
|
||||||
|
|
||||||
|
def fetch(f):
    """Make sure the data file `f` is present in the current directory.

    If `f` does not exist, `curl -O` is run to download it from the
    unicode.org UNIDATA directory (curl saves it under the same name).
    If the file is still missing afterwards, an error is written to
    stderr and the process exits with status 1.
    """
    if not os.path.exists(f):
        # curl's exit status is deliberately not inspected; success is
        # judged solely by whether the file exists afterwards.
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)

    if not os.path.exists(f):
        # Newline-terminate the message so it does not run into the shell
        # prompt, and use sys.exit rather than the `exit` helper that the
        # site module only injects for interactive convenience.
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def load_general_categories(f):
    """Parse a UnicodeData.txt-style file into general-category ranges.

    Each usable row has exactly 15 ';'-separated fields; the first is the
    code point in hex, the second the character name and the third the
    general category.  Rows with any other field count are skipped.

    Returns a dict mapping each category to a list of inclusive
    (lo, hi) code point tuples, in file order.
    """
    fetch(f)
    gencats = {}
    curr_cat = ""
    c_lo = 0
    c_hi = 0
    for line in fileinput.input(f):
        fields = line.split(";")
        if len(fields) != 15:
            continue

        code = int(fields[0], 16)
        name = fields[1]
        gencat = fields[2]

        # A run is extended only by the very next code point, or by the
        # "<..., Last>" row that closes a First/Last range pair.  Merging on
        # category alone would silently absorb the unlisted (unassigned)
        # code points that lie between two rows of the same category.
        closes_range = name.endswith(", Last>")
        if curr_cat == gencat and (code == c_hi + 1 or closes_range):
            c_hi = code
            continue

        # Category changed or there is a gap: record the finished run
        # (there is none before the first row) and start a new one.
        if curr_cat != "":
            gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
        curr_cat = gencat
        c_lo = code
        c_hi = code

    # Flush the run that was still open when the input ended; without this
    # the last range in the file would be lost.
    if curr_cat != "":
        gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
    return gencats
|
||||||
|
|
||||||
|
|
||||||
|
def load_derived_core_properties(f):
    """Parse a DerivedCoreProperties.txt-style file.

    Recognised lines start with either a single hex code point or a
    `lo..hi` hex range, followed by spaces, `; ` and a property name.
    Only the XID_Start, XID_Continue and Alphabetic properties are kept;
    every other line is ignored.

    Returns a dict mapping property name to a list of inclusive
    (lo, hi) code point tuples, in file order.
    """
    fetch(f)
    derivedprops = {}
    interestingprops = ("XID_Start", "XID_Continue", "Alphabetic")
    # One raw-string pattern covers both forms: the "..hi" part is optional.
    # Raw strings keep "\w" and "\." from being read as string escapes.
    entry_re = re.compile(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))? +; (\w+)")

    for line in fileinput.input(f):
        m = entry_re.match(line)
        if m is None:
            continue
        lo_hex, hi_hex, prop = m.groups()
        if prop not in interestingprops:
            continue
        d_lo = int(lo_hex, 16)
        # A single code point is stored as a one-element range.
        d_hi = d_lo if hi_hex is None else int(hi_hex, 16)
        derivedprops.setdefault(prop, []).append((d_lo, d_hi))
    return derivedprops
|
||||||
|
|
||||||
|
def escape_char(c):
    """Return the quoted Rust char-literal escape for code point `c`.

    The narrowest form that fits is chosen: `\\x` with two hex digits up
    to 0xff, `\\u` with four up to 0xffff, and `\\U` with eight beyond.
    """
    if c > 0xffff:
        return "'\\U%8.8x'" % c
    if c > 0xff:
        return "'\\u%4.4x'" % c
    return "'\\x%2.2x'" % c
|
||||||
|
|
||||||
|
def emit_rust_module(f, mod, tbl):
    """Write `tbl` to `f` as a Rust module named `mod`.

    `tbl` maps a name to a list of inclusive (lo, hi) code point pairs.
    For each name, in sorted order, a `pure fn <name>(c: char) -> bool`
    is written whose body is an `alt` on `c`: every pair becomes one
    pattern (a single literal when lo == hi, otherwise `lo to hi`), the
    patterns are joined with `|` and yield true; anything else is false.
    """
    f.write("mod %s {\n" % mod)
    # sorted() works on dicts under both Python 2 and 3; calling .sort() on
    # the result of .keys() breaks on Python 3, where it is a view.
    for cat in sorted(tbl):
        f.write(" pure fn %s(c: char) -> bool {\n" % cat)
        f.write(" ret alt c {\n")
        # The first pattern has no leading '|'; every later one does.
        prefix = ' '
        for pair in tbl[cat]:
            if pair[0] == pair[1]:
                f.write(" %c %s\n" %
                        (prefix, escape_char(pair[0])))
            else:
                f.write(" %c %s to %s\n" %
                        (prefix,
                         escape_char(pair[0]),
                         escape_char(pair[1])))
            prefix = '|'
        f.write(" { true }\n")
        f.write(" _ { false }\n")
        f.write(" };\n")
        f.write(" }\n\n")
    f.write("}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def emit_cpp_module(f, mod, tbl):
    """Write `tbl` to `f` as C++ predicate functions.

    `tbl` maps a name to a list of inclusive (lo, hi) code point pairs.
    For each name, in sorted order, a `bool <mod>_<name>(unsigned c)`
    function is written: every true range (lo != hi) becomes a bounds
    check, and all single code points share one `switch` whose cases fall
    through to a common `return true;`.
    """
    # sorted() works on dicts under both Python 2 and 3; calling .sort() on
    # the result of .keys() breaks on Python 3, where it is a view.
    for cat in sorted(tbl):
        singles = []
        ranges = []
        for pair in tbl[cat]:
            if pair[0] == pair[1]:
                singles.append(pair[0])
            else:
                ranges.append(pair)

        f.write("bool %s_%s(unsigned c) {\n" % (mod, cat))
        for pair in ranges:
            f.write(" if (0x%x <= c && c <= 0x%x) { return true; }\n"
                    % pair)
        if singles:
            f.write(" switch (c) {\n")
            for single in singles:
                f.write(" case 0x%x:\n" % single)
            f.write(" return true;\n")
            f.write(" default:\n")
            f.write(" return false;\n")
            f.write(" }\n")
        # Reached when there is no switch at all (no single code points).
        f.write("return false;\n")
        f.write("}\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
def emit_module(rf, cf, mod, tbl):
    """Emit table `tbl` as module `mod` in both output flavours.

    The Rust rendering goes to the file object `rf` and the C++ rendering
    to `cf`, in that order.
    """
    for emitter, sink in ((emit_rust_module, rf), (emit_cpp_module, cf)):
        emitter(sink, mod, tbl)
|
||||||
|
|
||||||
|
# Driver: regenerate unicode.rs and unicode.cpp in the current directory
# from UnicodeData.txt and DerivedCoreProperties.txt.
r = "unicode.rs"
c = "unicode.cpp"
for i in [r, c]:
    if os.path.exists(i):
        os.remove(i)

# `with` guarantees both outputs are flushed and closed even if loading or
# emitting fails part-way; previously the handles were never closed.
with open(r, "w") as rf:
    with open(c, "w") as cf:
        emit_module(rf, cf, "general_category",
                    load_general_categories("UnicodeData.txt"))

        emit_module(rf, cf, "derived_property",
                    load_derived_core_properties("DerivedCoreProperties.txt"))
|
||||||
@@ -4,96 +4,75 @@ Module: char
|
|||||||
Utilities for manipulating the char type
|
Utilities for manipulating the char type
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
Lu Uppercase_Letter an uppercase letter
|
||||||
|
Ll Lowercase_Letter a lowercase letter
|
||||||
|
Lt Titlecase_Letter a digraphic character, with first part uppercase
|
||||||
|
Lm Modifier_Letter a modifier letter
|
||||||
|
Lo Other_Letter other letters, including syllables and ideographs
|
||||||
|
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
|
||||||
|
Mc Spacing_Mark a spacing combining mark (positive advance width)
|
||||||
|
Me Enclosing_Mark an enclosing combining mark
|
||||||
|
Nd Decimal_Number a decimal digit
|
||||||
|
Nl Letter_Number a letterlike numeric character
|
||||||
|
No Other_Number a numeric character of other type
|
||||||
|
Pc Connector_Punctuation a connecting punctuation mark, like a tie
|
||||||
|
Pd Dash_Punctuation a dash or hyphen punctuation mark
|
||||||
|
Ps Open_Punctuation an opening punctuation mark (of a pair)
|
||||||
|
Pe Close_Punctuation a closing punctuation mark (of a pair)
|
||||||
|
Pi Initial_Punctuation an initial quotation mark
|
||||||
|
Pf Final_Punctuation a final quotation mark
|
||||||
|
Po Other_Punctuation a punctuation mark of other type
|
||||||
|
Sm Math_Symbol a symbol of primarily mathematical use
|
||||||
|
Sc Currency_Symbol a currency sign
|
||||||
|
Sk Modifier_Symbol a non-letterlike modifier symbol
|
||||||
|
So Other_Symbol a symbol of other type
|
||||||
|
Zs Space_Separator a space character (of various non-zero widths)
|
||||||
|
Zl Line_Separator U+2028 LINE SEPARATOR only
|
||||||
|
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
|
||||||
|
Cc Control a C0 or C1 control code
|
||||||
|
Cf Format a format control character
|
||||||
|
Cs Surrogate a surrogate code point
|
||||||
|
Co Private_Use a private-use character
|
||||||
|
Cn Unassigned a reserved unassigned code point or a noncharacter
|
||||||
|
*/
|
||||||
|
|
||||||
|
import is_alphabetic = unicode::derived_property::Alphabetic;
|
||||||
|
import is_XID_start = unicode::derived_property::XID_Start;
|
||||||
|
import is_XID_continue = unicode::derived_property::XID_Continue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Function: is_whitespace
|
Function: is_whitespace
|
||||||
|
|
||||||
Indicates whether a character is whitespace.
|
Indicates whether a character is whitespace, defined in terms of
|
||||||
|
the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
|
||||||
|
'Cc'-category control codes in the range [0x09, 0x0d].
|
||||||
|
|
||||||
Whitespace characters include space (U+0020), tab (U+0009), line feed
|
|
||||||
(U+000A), carriage return (U+000D), and a number of less common
|
|
||||||
ASCII and unicode characters.
|
|
||||||
*/
|
*/
|
||||||
pure fn is_whitespace(c: char) -> bool {
|
pure fn is_whitespace(c: char) -> bool {
|
||||||
const ch_space: char = '\u0020';
|
ret ('\x09' <= c && c <= '\x0d')
|
||||||
const ch_ogham_space_mark: char = '\u1680';
|
|| unicode::general_category::Zs(c)
|
||||||
const ch_mongolian_vowel_sep: char = '\u180e';
|
|| unicode::general_category::Zl(c)
|
||||||
const ch_en_quad: char = '\u2000';
|
|| unicode::general_category::Zp(c);
|
||||||
const ch_em_quad: char = '\u2001';
|
|
||||||
const ch_en_space: char = '\u2002';
|
|
||||||
const ch_em_space: char = '\u2003';
|
|
||||||
const ch_three_per_em_space: char = '\u2004';
|
|
||||||
const ch_four_per_em_space: char = '\u2005';
|
|
||||||
const ch_six_per_em_space: char = '\u2006';
|
|
||||||
const ch_figure_space: char = '\u2007';
|
|
||||||
const ch_punctuation_space: char = '\u2008';
|
|
||||||
const ch_thin_space: char = '\u2009';
|
|
||||||
const ch_hair_space: char = '\u200a';
|
|
||||||
const ch_narrow_no_break_space: char = '\u202f';
|
|
||||||
const ch_medium_mathematical_space: char = '\u205f';
|
|
||||||
const ch_ideographic_space: char = '\u3000';
|
|
||||||
const ch_line_separator: char = '\u2028';
|
|
||||||
const ch_paragraph_separator: char = '\u2029';
|
|
||||||
const ch_character_tabulation: char = '\u0009';
|
|
||||||
const ch_line_feed: char = '\u000a';
|
|
||||||
const ch_line_tabulation: char = '\u000b';
|
|
||||||
const ch_form_feed: char = '\u000c';
|
|
||||||
const ch_carriage_return: char = '\u000d';
|
|
||||||
const ch_next_line: char = '\u0085';
|
|
||||||
const ch_no_break_space: char = '\u00a0';
|
|
||||||
|
|
||||||
if c == ch_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_ogham_space_mark {
|
|
||||||
true
|
|
||||||
} else if c == ch_mongolian_vowel_sep {
|
|
||||||
true
|
|
||||||
} else if c == ch_en_quad {
|
|
||||||
true
|
|
||||||
} else if c == ch_em_quad {
|
|
||||||
true
|
|
||||||
} else if c == ch_en_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_em_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_three_per_em_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_four_per_em_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_six_per_em_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_figure_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_punctuation_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_thin_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_hair_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_narrow_no_break_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_medium_mathematical_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_ideographic_space {
|
|
||||||
true
|
|
||||||
} else if c == ch_line_tabulation {
|
|
||||||
true
|
|
||||||
} else if c == ch_paragraph_separator {
|
|
||||||
true
|
|
||||||
} else if c == ch_character_tabulation {
|
|
||||||
true
|
|
||||||
} else if c == ch_line_feed {
|
|
||||||
true
|
|
||||||
} else if c == ch_line_tabulation {
|
|
||||||
true
|
|
||||||
} else if c == ch_form_feed {
|
|
||||||
true
|
|
||||||
} else if c == ch_carriage_return {
|
|
||||||
true
|
|
||||||
} else if c == ch_next_line {
|
|
||||||
true
|
|
||||||
} else if c == ch_no_break_space { true } else { false }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Function: is_alphanumeric
|
||||||
|
|
||||||
|
Indicates whether a character is alphanumeric, defined in terms of
|
||||||
|
the Unicode General Categories 'Nd', 'Nl', 'No' and the Derived
|
||||||
|
Core Property 'Alphabetic'.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
pure fn is_alphanumeric(c: char) -> bool {
    // True when `c` has the Alphabetic derived property or belongs to one
    // of the numeric general categories Nd, Nl or No.
    ret unicode::derived_property::Alphabetic(c) ||
        unicode::general_category::Nd(c) ||
        unicode::general_category::Nl(c) ||
        unicode::general_category::No(c);
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Function: to_digit
|
Function: to_digit
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,9 @@ mod u64;
|
|||||||
mod vec;
|
mod vec;
|
||||||
mod bool;
|
mod bool;
|
||||||
|
|
||||||
|
// For internal use by char, not exported
|
||||||
|
mod unicode;
|
||||||
|
|
||||||
|
|
||||||
// Ubiquitous-utility-type modules
|
// Ubiquitous-utility-type modules
|
||||||
|
|
||||||
|
|||||||
4683
src/libcore/unicode.rs
Normal file
4683
src/libcore/unicode.rs
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user