Dynamically choose best chunk size

Try chunk sizes between 1 and 64, selecting the one that minimizes the number
of bytes used. The previous constant, 16, turned out to be a rather good
choice: 5 of the 9 datasets still use it.

Alphabetic     : 3036 bytes    (- 19 bytes)
Case_Ignorable : 2136 bytes
Cased          : 934 bytes
Cc             : 32 bytes      (- 11 bytes)
Grapheme_Extend: 1774 bytes
Lowercase      : 985 bytes
N              : 1225 bytes    (- 41 bytes)
Uppercase      : 934 bytes
White_Space    : 97 bytes      (- 43 bytes)
Total table sizes: 11153 bytes (-114 bytes)
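
The selection itself is a brute-force search over every candidate length. A
minimal sketch of the idea (not the generator's exact code: bytes_for_chunk_size
here is a simplified stand-in for cloning the emitter, re-running emit_chunk_map,
and reading back bytes_used, and it ignores the stripping of trailing zero
chunks):

use std::collections::BTreeSet;

// Simplified cost model: one index byte per chunk, plus chunk_length bytes
// for every deduplicated chunk stored in the table.
fn bytes_for_chunk_size(compressed_words: &[u8], chunk_length: usize) -> usize {
    let mut words = compressed_words.to_vec();
    // Pad with zeros so the data divides evenly into chunks, as the emitter does.
    while words.len() % chunk_length != 0 {
        words.push(0);
    }
    let unique_chunks: BTreeSet<&[u8]> = words.chunks(chunk_length).collect();
    words.len() / chunk_length + chunk_length * unique_chunks.len()
}

// Try every chunk length from 1 to 64 and keep the cheapest, mirroring the
// commit's loop over a cloned emitter.
fn best_chunk_size(compressed_words: &[u8]) -> usize {
    (1usize..=64).min_by_key(|&len| bytes_for_chunk_size(compressed_words, len)).unwrap()
}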
commit 7c4baedb3a
parent 903f67d599
Author: Mark Rousskov
Date:   2020-03-19 11:38:41 -04:00

3 changed files with 134 additions and 138 deletions


@@ -42,6 +42,7 @@ use std::convert::TryFrom;
 use std::fmt::Write;
 use std::ops::Range;
 
+#[derive(Clone)]
 pub struct RawEmitter {
     pub file: String,
     pub bytes_used: usize,
@@ -65,6 +66,8 @@ impl RawEmitter {
         if unique_words.len() > u8::max_value() as usize {
             panic!("cannot pack {} into 8 bits", unique_words.len());
         }
+        // needed for the chunk mapping to work
+        assert_eq!(unique_words[0], 0, "first word is all zeros");
 
         let word_indices = unique_words
             .iter()
@@ -72,17 +75,42 @@ impl RawEmitter {
             .enumerate()
             .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
             .collect::<HashMap<_, _>>();
-        let mut idx = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
-        let chunk_length = 16;
-        for _ in 0..(chunk_length - (idx.len() % chunk_length)) {
-            assert_eq!(unique_words[0], 0, "first word is all zeros");
-            // pad out bitset index with zero words so we have all chunks of 16
-            idx.push(0);
+        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
+
+        let mut best = None;
+        for length in 1..=64 {
+            let mut temp = self.clone();
+            temp.emit_chunk_map(&compressed_words, length);
+            if let Some((_, size)) = best {
+                if temp.bytes_used < size {
+                    best = Some((length, temp.bytes_used));
+                }
+            } else {
+                best = Some((length, temp.bytes_used));
+            }
+        }
+        self.emit_chunk_map(&compressed_words, best.unwrap().0);
+
+        writeln!(
+            &mut self.file,
+            "static BITSET: [u64; {}] = [{}];",
+            unique_words.len(),
+            fmt_list(&unique_words),
+        )
+        .unwrap();
+        self.bytes_used += 8 * unique_words.len();
+    }
+
+    fn emit_chunk_map(&mut self, compressed_words: &[u8], chunk_length: usize) {
+        let mut compressed_words = compressed_words.to_vec();
+        for _ in 0..(chunk_length - (compressed_words.len() % chunk_length)) {
+            // pad out bitset index with zero words so we have all chunks of
+            // chunk_length
+            compressed_words.push(0);
         }
 
         let mut chunks = BTreeSet::new();
-        for chunk in idx.chunks(chunk_length) {
+        for chunk in compressed_words.chunks(chunk_length) {
             chunks.insert(chunk);
         }
@@ -92,7 +120,7 @@ impl RawEmitter {
         let chunk_map = chunks
             .iter()
             .enumerate()
             .map(|(idx, chunk)| (chunk, idx))
             .collect::<HashMap<_, _>>();
         let mut chunk_indices = Vec::new();
-        for chunk in idx.chunks(chunk_length) {
+        for chunk in compressed_words.chunks(chunk_length) {
             chunk_indices.push(chunk_map[chunk]);
         }
@@ -105,7 +133,6 @@ impl RawEmitter {
         self.bytes_used += 3;
         // Strip out the empty pieces, presuming our above pop() made us now
         // have some trailing zeros.
-        assert_eq!(unique_words[0], 0, "first word is all zeros");
         while let Some(0) = chunk_indices.last() {
             chunk_indices.pop();
         }
@@ -119,20 +146,13 @@ impl RawEmitter {
         self.bytes_used += chunk_indices.len();
 
         writeln!(
             &mut self.file,
-            "static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];",
+            "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
+            chunk_length,
             chunks.len(),
             fmt_list(chunks.iter()),
         )
         .unwrap();
-        self.bytes_used += 16 * chunks.len();
-
-        writeln!(
-            &mut self.file,
-            "static BITSET: [u64; {}] = [{}];",
-            unique_words.len(),
-            fmt_list(&unique_words),
-        )
-        .unwrap();
-        self.bytes_used += 8 * unique_words.len();
+        self.bytes_used += chunk_length * chunks.len();
     }
 
     pub fn emit_lookup(&mut self) {
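
For context on how the emitted statics fit together: a lookup maps a codepoint
to its 64-bit word, the word to its chunk, and the chunk to a deduplicated row
of BITSET_INDEX_CHUNKS whose bytes in turn index into BITSET. The sketch below
is an assumption about that two-level scheme, with hypothetical parameter names
and signature; it is not the generated code:

// Hypothetical driver for the generated tables: map a codepoint to its word,
// the word to its chunk, the chunk to a deduplicated row, then test one bit.
fn bitset_lookup(
    needle: u32,              // codepoint to test
    chunks_map: &[u8],        // chunk number -> row in index_chunks
    index_chunks: &[&[u8]],   // deduplicated chunks of word indices
    bitset: &[u64],           // deduplicated 64-bit words
) -> bool {
    let word_idx = (needle / 64) as usize;
    let chunk_len = index_chunks[0].len();
    let (chunk, offset) = (word_idx / chunk_len, word_idx % chunk_len);
    // Trailing all-zero chunk indices were stripped from the map, so out of
    // range means the bit is not set.
    let row = match chunks_map.get(chunk) {
        Some(&r) => r as usize,
        None => return false,
    };
    let word = bitset[index_chunks[row][offset] as usize];
    (word >> (needle % 64)) & 1 != 0
}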