Shrink bitset words through functional mapping

Previously, all words in the (deduplicated) bitset would be stored raw -- a full 64 bits (8 bytes). Now, those words that are equivalent to others through a specific mapping are stored separately and "mapped" to the original when loading; this shrinks the table sizes significantly, as each mapped word is stored in 2 bytes (a 4x decrease from the previous). The new encoding is also potentially non-optimal: the "mapped" byte is frequently repeated, as in practice many mapped words use the same base word. Currently we only support two forms of mapping: rotation and inversion. Note that these are both guaranteed to map transitively if at all, and supporting mappings for which this is not true may require a more interesting algorithm for choosing the optimal pairing. Updated sizes: Alphabetic : 2622 bytes (- 414 bytes) Case_Ignorable : 1803 bytes (- 330 bytes) Cased : 808 bytes (- 126 bytes) Cc : 32 bytes Grapheme_Extend: 1508 bytes (- 252 bytes) Lowercase : 901 bytes (- 84 bytes) N : 1064 bytes (- 156 bytes) Uppercase : 838 bytes (- 96 bytes) White_Space : 91 bytes (- 6 bytes) Total table sizes: 9667 bytes (-1,464 bytes)
2020-03-21 10:16:01 -04:00
parent 6c7691a37b
commit b0e121d9d5
4 changed files with 1211 additions and 432 deletions
--- a/src/tools/unicode-table-generator/src/main.rs
+++ b/src/tools/unicode-table-generator/src/main.rs
@@ -254,12 +254,19 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
    s.push_str(
        "
 #[inline(always)]
-fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const N2: usize>(
+fn range_search<
+    const N: usize,
+    const CHUNK_SIZE: usize,
+    const N1: usize,
+    const CANONICAL: usize,
+    const CANONICALIZED: usize,
+>(
    needle: u32,
    chunk_idx_map: &[u8; N],
    (last_chunk_idx, last_chunk_mapping): (u16, u8),
    bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
-    bitset: &[u64; N2],
+    bitset_canonical: &[u64; CANONICAL],
+    bitset_canonicalized: &[(u8, u8); CANONICALIZED],
 ) -> bool {
    let bucket_idx = (needle / 64) as usize;
    let chunk_map_idx = bucket_idx / CHUNK_SIZE;
@@ -273,8 +280,21 @@ fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const
    } else {
        chunk_idx_map[chunk_map_idx]
    };
-    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
-    let word = bitset[(idx as usize)];
+    let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece] as usize;
+    let word = if idx < CANONICAL {
+        bitset_canonical[idx]
+    } else {
+        let (real_idx, mapping) = bitset_canonicalized[idx - CANONICAL];
+        let mut word = bitset_canonical[real_idx as usize];
+        let should_invert = mapping & (1 << 7) != 0;
+        if should_invert {
+            word = !word;
+        }
+        // Unset the inversion bit
+        let rotate_by = mapping & !(1 << 7);
+        word = word.rotate_left(rotate_by as u32);
+        word
+    };
    (word & (1 << (needle % 64) as u64)) != 0
 }
    ",
--- a/src/tools/unicode-table-generator/src/raw_emitter.rs
+++ b/src/tools/unicode-table-generator/src/raw_emitter.rs
@@ -22,8 +22,9 @@
 //! mapping into two separate sets; currently this is not dealt with).
 //!
 //! With that scheme, we now have a single byte for every 64 codepoints. We
-//! further group these by 16 (arbitrarily chosen), and again deduplicate and
-//! store in an array (u8 -> [u8; 16]).
+//! further group these by some constant N (between 1 and 64 per group), and
+//! again deduplicate and store in an array (u8 -> [u8; N]). The constant is
+//! chosen to be optimal in bytes-in-memory for the given dataset.
 //!
 //! The indices into this array represent ranges of 64*16 = 1024 codepoints.
 //!
@@ -37,9 +38,9 @@
 //! down considerably.

 use crate::fmt_list;
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::convert::TryFrom;
-use std::fmt::Write;
+use std::fmt::{self, Write};
 use std::ops::Range;

 #[derive(Clone)]
@@ -61,6 +62,10 @@ impl RawEmitter {
    }

    fn emit_bitset(&mut self, words: &[u64]) {
+        let mut words = words.to_vec();
+        // Ensure that there's a zero word in the dataset, used for padding and
+        // such.
+        words.push(0);
        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::max_value() as usize {
@@ -68,13 +73,9 @@ impl RawEmitter {
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");
+        let canonicalized = Canonicalized::canonicalize(&unique_words);

-        let word_indices = unique_words
-            .iter()
-            .cloned()
-            .enumerate()
-            .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
-            .collect::<HashMap<_, _>>();
+        let word_indices = canonicalized.unique_mapping.clone();
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();

        let mut best = None;
@@ -91,14 +92,32 @@ impl RawEmitter {
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);

+        struct Bits(u64);
+        impl fmt::Debug for Bits {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                write!(f, "0b{:064b}", self.0)
+            }
+        }
+
        writeln!(
            &mut self.file,
-            "static BITSET: [u64; {}] = [{}];",
-            unique_words.len(),
-            fmt_list(&unique_words),
+            "static BITSET_CANONICAL: [u64; {}] = [{}];",
+            canonicalized.canonical_words.len(),
+            fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
        )
        .unwrap();
-        self.bytes_used += 8 * unique_words.len();
+        self.bytes_used += 8 * canonicalized.canonical_words.len();
+        writeln!(
+            &mut self.file,
+            "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
+            canonicalized.canonicalized_words.len(),
+            fmt_list(&canonicalized.canonicalized_words),
+        )
+        .unwrap();
+        // 8 bit index into shifted words, 7 bits for shift + optional flip
+        // We only need it for the words that we removed by applying a shift and
+        // flip to them.
+        self.bytes_used += 2 * canonicalized.canonicalized_words.len();
    }

    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
@@ -170,7 +189,8 @@ impl RawEmitter {
        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
-        writeln!(&mut self.file, "        &BITSET,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
        writeln!(&mut self.file, "    )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();
    }
@@ -196,3 +216,193 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();
    emitter.emit_lookup();
 }
+
+struct Canonicalized {
+    canonical_words: Vec<u64>,
+    canonicalized_words: Vec<(u8, u8)>,
+
+    /// Maps an input unique word to the associated index (u8) which is into
+    /// canonical_words or canonicalized_words (in order).
+    unique_mapping: HashMap<u64, u8>,
+}
+
+impl Canonicalized {
+    fn canonicalize(unique_words: &[u64]) -> Self {
+        #[derive(Copy, Clone, Debug)]
+        enum Mapping {
+            Rotate(u32),
+            Invert,
+            RotateAndInvert(u32),
+        }
+
+        // key is the word being mapped to
+        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
+        for &a in unique_words {
+            'b: for &b in unique_words {
+                // skip self
+                if a == b {
+                    continue;
+                }
+
+                // All possible distinct rotations
+                for rotation in 1..64 {
+                    if a.rotate_right(rotation) == b {
+                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+
+                if (!a) == b {
+                    mappings.entry(b).or_default().push((a, Mapping::Invert));
+                    // We're not interested in further mappings between a and b
+                    continue 'b;
+                }
+
+                // All possible distinct rotations, inverted
+                for rotation in 1..64 {
+                    if (!a.rotate_right(rotation)) == b {
+                        mappings
+                            .entry(b)
+                            .or_default()
+                            .push((a, Mapping::RotateAndInvert(rotation)));
+                        // We're not interested in further mappings between a and b
+                        continue 'b;
+                    }
+                }
+            }
+        }
+        // These are the bitset words which will be represented "raw" (as a u64)
+        let mut canonical_words = Vec::new();
+        // These are mapped words, which will be represented by an index into
+        // the canonical_words and a Mapping; u16 when encoded.
+        let mut canonicalized_words = Vec::new();
+        let mut unique_mapping = HashMap::new();
+
+        #[derive(Debug, PartialEq, Eq)]
+        enum UniqueMapping {
+            Canonical(usize),
+            Canonicalized(usize),
+        }
+
+        while let Some((&to, _)) = mappings.iter().max_by_key(|m| m.1.len()) {
+            // Get the mapping with the most entries. Currently, no mapping can
+            // only exist transitively (i.e., there is no A, B, C such that A
+            // does not map to C and but A maps to B maps to C), so this is
+            // guaranteed to be acceptable.
+            //
+            // In the future, we may need a more sophisticated algorithm to
+            // identify which keys to prefer as canonical.
+            let mapped_from = mappings.remove(&to).unwrap();
+            for (from, how) in &mapped_from {
+                // Remove the entries which mapped to this one.
+                // Noting that it should be associated with the Nth canonical word.
+                //
+                // We do not assert that this is present, because there may be
+                // no mappings to the `from` word; that's fine.
+                mappings.remove(from);
+                assert_eq!(
+                    unique_mapping
+                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
+                    None
+                );
+                canonicalized_words.push((canonical_words.len(), *how));
+
+                // Remove the now-canonicalized word from other mappings,
+                // to ensure that we deprioritize them in the next iteration of
+                // the while loop.
+                for (_, mapped) in &mut mappings {
+                    let mut i = 0;
+                    while i != mapped.len() {
+                        if mapped[i].0 == *from {
+                            mapped.remove(i);
+                        } else {
+                            i += 1;
+                        }
+                    }
+                }
+            }
+            assert!(
+                unique_mapping
+                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
+                    .is_none()
+            );
+            canonical_words.push(to);
+
+            // Remove the now-canonical word from other mappings, to ensure that
+            // we deprioritize them in the next iteration of the while loop.
+            for (_, mapped) in &mut mappings {
+                let mut i = 0;
+                while i != mapped.len() {
+                    if mapped[i].0 == to {
+                        mapped.remove(i);
+                    } else {
+                        i += 1;
+                    }
+                }
+            }
+        }
+
+        // Any words which we couldn't shrink, just stick into the canonical
+        // words.
+        //
+        // FIXME: work harder -- there are more possibilities for mapping
+        // functions (e.g., multiplication, shifting instead of rotation, etc.)
+        // We'll probably always have some slack though so this loop will still
+        // be needed.
+        for &w in unique_words {
+            if !unique_mapping.contains_key(&w) {
+                assert!(
+                    unique_mapping
+                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
+                        .is_none()
+                );
+                canonical_words.push(w);
+            }
+        }
+        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
+        assert_eq!(unique_mapping.len(), unique_words.len());
+
+        let unique_mapping = unique_mapping
+            .into_iter()
+            .map(|(key, value)| {
+                (
+                    key,
+                    match value {
+                        UniqueMapping::Canonicalized(idx) => {
+                            u8::try_from(canonical_words.len() + idx).unwrap()
+                        }
+                        UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
+                    },
+                )
+            })
+            .collect::<HashMap<_, _>>();
+
+        let mut distinct_indices = BTreeSet::new();
+        for &w in unique_words {
+            let idx = unique_mapping.get(&w).unwrap();
+            assert!(distinct_indices.insert(idx));
+        }
+
+        let canonicalized_words = canonicalized_words
+            .into_iter()
+            .map(|v| {
+                (
+                    u8::try_from(v.0).unwrap(),
+                    match v.1 {
+                        Mapping::RotateAndInvert(amount) => {
+                            assert!(amount < (1 << 7));
+                            1 << 7 | (amount as u8)
+                        }
+                        Mapping::Rotate(amount) => {
+                            assert!(amount < (1 << 7));
+                            amount as u8
+                        }
+                        Mapping::Invert => 1 << 7,
+                    },
+                )
+            })
+            .collect::<Vec<(u8, u8)>>();
+        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
+    }
+}