use normative source for Grapheme class data
@mahkoh points out in #15628 that unicode.py does not use normative data for Grapheme classes. This pr fixes that issue. In addition, GC_RegionalIndicator is renamed GC_Regional_Indicator in order to stay in line with the Unicode class name definitions. I have updated refs in u_str.rs, and verified that there are no refs elsewhere in the codebase. However, in principle someone using the unicode tables for their own purposes might see breakage from this.
This commit is contained in:
@@ -12,8 +12,11 @@
|
||||
|
||||
# This script uses the following Unicode tables:
|
||||
# - DerivedCoreProperties.txt
|
||||
# - DerivedNormalizationProps.txt
|
||||
# - EastAsianWidth.txt
|
||||
# - auxiliary/GraphemeBreakProperty.txt
|
||||
# - PropList.txt
|
||||
# - ReadMe.txt
|
||||
# - Scripts.txt
|
||||
# - UnicodeData.txt
|
||||
#
|
||||
@@ -51,41 +54,20 @@ expanded_categories = {
|
||||
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
|
||||
}
|
||||
|
||||
|
||||
# Grapheme cluster data
|
||||
# taken from UAX29, http://www.unicode.org/reports/tr29/
|
||||
# these code points are excluded from the Control category
|
||||
# NOTE: CR and LF are also technically excluded, but for
|
||||
# the sake of convenience we leave them in the Control group
|
||||
# and manually check them in the appropriate place. This is
|
||||
# still compliant with the implementation requirements.
|
||||
grapheme_control_exceptions = set([0x200c, 0x200d])
|
||||
|
||||
# the Regional_Indicator category
|
||||
grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)]
|
||||
|
||||
# "The following ... are specifically excluded" from the SpacingMark category
|
||||
# http://www.unicode.org/reports/tr29/#SpacingMark
|
||||
grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038),
|
||||
(0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c),
|
||||
(0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9),
|
||||
(0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64),
|
||||
(0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)]
|
||||
|
||||
# these are included in the SpacingMark category
|
||||
grapheme_spacingmark_extra = set([0xe33, 0xeb3])
|
||||
# these are the surrogate codepoints, which are not valid rust characters
|
||||
surrogate_codepoints = (0xd800, 0xdfff)
|
||||
|
||||
def fetch(f):
|
||||
if not os.path.exists(f):
|
||||
if not os.path.exists(os.path.basename(f)):
|
||||
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
|
||||
% f)
|
||||
|
||||
if not os.path.exists(f):
|
||||
if not os.path.exists(os.path.basename(f)):
|
||||
sys.stderr.write("cannot load %s" % f)
|
||||
exit(1)
|
||||
|
||||
def is_surrogate(n):
|
||||
return 0xD800 <= n <= 0xDFFF
|
||||
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
|
||||
|
||||
def load_unicode_data(f):
|
||||
fetch(f)
|
||||
@@ -228,7 +210,7 @@ def load_properties(f, interestingprops):
|
||||
re1 = re.compile("^([0-9A-F]+) +; (\w+)")
|
||||
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
|
||||
|
||||
for line in fileinput.input(f):
|
||||
for line in fileinput.input(os.path.basename(f)):
|
||||
prop = None
|
||||
d_lo = 0
|
||||
d_hi = 0
|
||||
@@ -623,7 +605,7 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
||||
(canon_decomp, compat_decomp, gencats, combines,
|
||||
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
|
||||
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
|
||||
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
|
||||
other_derived = ["Default_Ignorable_Code_Point"]
|
||||
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
|
||||
scripts = load_properties("Scripts.txt", [])
|
||||
props = load_properties("PropList.txt",
|
||||
@@ -631,12 +613,6 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
||||
norm_props = load_properties("DerivedNormalizationProps.txt",
|
||||
["Full_Composition_Exclusion"])
|
||||
|
||||
# grapheme cluster category from DerivedCoreProperties
|
||||
# the rest are defined below
|
||||
grapheme_cats = {}
|
||||
grapheme_cats["Extend"] = derived["Grapheme_Extend"]
|
||||
del(derived["Grapheme_Extend"])
|
||||
|
||||
# bsearch_range_table is used in all the property modules below
|
||||
emit_bsearch_range_table(rf)
|
||||
|
||||
@@ -691,34 +667,24 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
||||
|
||||
### grapheme cluster module
|
||||
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
|
||||
# Hangul syllable categories
|
||||
want_hangul = ["L", "V", "T", "LV", "LVT"]
|
||||
grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul))
|
||||
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
|
||||
|
||||
# Control
|
||||
# Note 1:
|
||||
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
|
||||
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
|
||||
grapheme_cats["Control"] = set()
|
||||
for cat in ["Zl", "Zp", "Cc", "Cf"]:
|
||||
grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat]))
|
||||
# Thus, we have to remove Cs from the Control category
|
||||
# Note 2:
|
||||
# 0x0a and 0x0d (CR and LF) are not in the Control category for Graphemes.
|
||||
# However, the Graphemes iterator treats these as a special case, so they
|
||||
# should be included in grapheme_cats["Control"] for our implementation.
|
||||
grapheme_cats["Control"] = group_cat(list(
|
||||
grapheme_cats["Control"]
|
||||
- grapheme_control_exceptions
|
||||
| (set(ungroup_cat(gencats["Cn"]))
|
||||
& set(ungroup_cat(derived["Default_Ignorable_Code_Point"])))))
|
||||
|
||||
# Regional Indicator
|
||||
grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator
|
||||
|
||||
# Prepend - "Currently there are no characters with this value"
|
||||
# (from UAX#29, Unicode 7.0)
|
||||
|
||||
# SpacingMark
|
||||
grapheme_cats["SpacingMark"] = group_cat(list(
|
||||
set(ungroup_cat(gencats["Mc"]))
|
||||
- set(ungroup_cat(grapheme_cats["Extend"]))
|
||||
| grapheme_spacingmark_extra
|
||||
- set(ungroup_cat(grapheme_spacingmark_exceptions))))
|
||||
(set(ungroup_cat(grapheme_cats["Control"]))
|
||||
| set(ungroup_cat(grapheme_cats["CR"]))
|
||||
| set(ungroup_cat(grapheme_cats["LF"])))
|
||||
- set(ungroup_cat([surrogate_codepoints]))))
|
||||
del(grapheme_cats["CR"])
|
||||
del(grapheme_cats["LF"])
|
||||
|
||||
grapheme_table = []
|
||||
for cat in grapheme_cats:
|
||||
|
||||
Reference in New Issue
Block a user