working updates

2022-07-19 17:35:19 -07:00
parent 8bd12e8cca
commit d4819632e2
3 changed files with 108 additions and 2 deletions
--- a/src/tools/unicode-table-generator/src/cascading_map.rs
+++ b/src/tools/unicode-table-generator/src/cascading_map.rs
@@ -0,0 +1,90 @@
+use crate::fmt_list;
+use crate::raw_emitter::RawEmitter;
+use std::collections::HashMap;
+use std::fmt::Write as _;
+use std::ops::Range;
+
+
+impl RawEmitter {
+    pub fn emit_cascading_map(&mut self, ranges: &[Range<u32>]) -> bool {
+
+        let mut map: [u8; 256] = [
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        ];
+
+        let points = ranges.iter().flat_map(
+            |r| (r.start..r.end).into_iter().collect::<Vec<u32>>()
+        ).collect::<Vec<u32>>();
+
+        println!("there are {} points", points.len());
+
+        // how many distinct ranges need to be counted?
+        let mut codepoints_by_high_bytes = HashMap::<usize, Vec<u32>>::new();
+        for point in points {
+            // assert that there is no whitespace over the 0x3000 range.
+            assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
+            let high_bytes = point as usize >> 8;
+            let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
+            codepoints.push(point);
+        }
+
+        let mut bit_for_high_byte = 1u8;
+        let mut arms = Vec::<String>::new();
+
+        let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
+        high_bytes.sort();
+        for high_byte in high_bytes {
+            let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
+            if codepoints.len() == 1 {
+                let ch = codepoints.pop().unwrap();
+                arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
+                continue;
+            }
+            // more than 1 codepoint in this arm
+            for codepoint in codepoints {
+                map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
+            }
+            arms.push(format!(
+                "{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
+                high_byte,
+                bit_for_high_byte)
+            );
+            bit_for_high_byte <<= 1;
+        }
+
+        writeln!(
+            &mut self.file,
+            "static WHITESPACE_MAP: [u8; 256] = [{}];",
+            fmt_list(map.iter())
+        )
+        .unwrap();
+        self.bytes_used += 256;
+
+
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    match c as u32 >> 8 {{").unwrap();
+        for arm in arms {
+            writeln!(&mut self.file, "        {},", arm).unwrap();
+        }
+        writeln!(&mut self.file, "        _ => false,").unwrap();
+        writeln!(&mut self.file, "    }}").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+
+        true
+    }
+}
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -78,9 +78,10 @@ use ucd_parse::Codepoints;
 mod case_mapping;
 mod raw_emitter;
 mod skiplist;
+mod cascading_map;
 mod unicode_download;

-use raw_emitter::{emit_codepoints, RawEmitter};
+use raw_emitter::{emit_codepoints, emit_whitespace, RawEmitter};

 static PROPERTIES: &[&str] = &[
    "Alphabetic",
@@ -241,8 +242,13 @@ fn main() {
    let mut modules = Vec::new();
    for (property, ranges) in ranges_by_property {
        let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
+
        let mut emitter = RawEmitter::new();
-        emit_codepoints(&mut emitter, &ranges);
+        if property == &"White_Space" {
+            emit_whitespace(&mut emitter, &ranges);
+        } else {
+            emit_codepoints(&mut emitter, &ranges);
+        }

        modules.push((property.to_lowercase().to_string(), emitter.file));
        println!(
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -170,6 +170,16 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    }
 }

+pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
+    emitter.blank_line();
+
+    let mut cascading = emitter.clone();
+    cascading.emit_cascading_map(&ranges);
+    *emitter = cascading;
+    emitter.desc = String::from("cascading");
+
+}
+
 struct Canonicalized {
    canonical_words: Vec<u64>,
    canonicalized_words: Vec<(u8, u8)>,