Skip to content

Commit

Permalink
perf: faster term hash map (#1940)
Browse files Browse the repository at this point in the history
* add term hashmap benchmark

* refactor arena hashmap

add inlines
remove occupied array and use table_entry.is_empty instead (saves 4 bytes per entry)
raise saturation threshold from 1/3 to 1/2 to reduce memory (the table is allowed to fill up more before resizing)
use u32 for UnorderedId (we have the 4billion limit anyways on the Columnar stuff)
fix naming LinearProbing
remove byteorder dependency

memory consumption went down from 2 GB to 1.8 GB on indexing wikipedia dataset in tantivy

* Update stacker/src/arena_hashmap.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
  • Loading branch information
PSeitz and fulmicoton authored Apr 17, 2023
1 parent 780e263 commit e83abbf
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 53 deletions.
6 changes: 3 additions & 3 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::{
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value};
Expand All @@ -26,7 +26,7 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
let table_memory_upper_bound = per_thread_memory_budget / 3;
(10..20) // We cap it at 2^19 = 512K capacity.
.map(|power| 1 << power)
.take_while(|capacity| compute_table_size(*capacity) < table_memory_upper_bound)
.take_while(|capacity| compute_table_memory_size(*capacity) < table_memory_upper_bound)
.last()
.ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!(
Expand Down Expand Up @@ -455,7 +455,7 @@ mod tests {
/// Checks that `compute_initial_table_size` scales the initial term-hashmap
/// capacity with the per-thread memory budget and caps it at 2^19 entries.
///
/// Fix: the scraped diff retained both the pre-image (`1 << 17`) and
/// post-image (`1 << 18`) assertion for the 10 MB budget, which would make
/// the test assert two different values for the same input; only the
/// post-commit expectation is kept.
fn test_hashmap_size() {
    assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
    assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
    // Was `1 << 18` after this commit (previously `1 << 17`): the higher
    // saturation threshold lets a larger table fit the same budget.
    assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 18);
    assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
    assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
}
Expand Down
2 changes: 1 addition & 1 deletion src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mod serializer;
mod skip;
mod term_info;

pub(crate) use stacker::compute_table_size;
pub(crate) use stacker::compute_table_memory_size;

pub use self::block_segment_postings::BlockSegmentPostings;
pub(crate) use self::indexing_context::IndexingContext;
Expand Down
16 changes: 15 additions & 1 deletion stacker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,19 @@ license = "MIT"

[dependencies]
murmurhash32 = "0.3"
byteorder = "1"
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
criterion = "0.4.0"

[[bench]]
harness = false
name = "crit_bench"
path = "benches/crit_bench.rs"

[[example]]
name = "hashmap"
path = "example/hashmap.rs"

[dev-dependencies]
rand = "0.8.5"
zipf = "7.0.0"

70 changes: 70 additions & 0 deletions stacker/benches/crit_bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#![allow(dead_code)]
extern crate criterion;

use criterion::*;
use rand::SeedableRng;
use tantivy_stacker::ArenaHashMap;

const ALICE: &str = include_str!("../../benches/alice.txt");

/// Benchmarks `ArenaHashMap` construction throughput over three input
/// distributions: natural-language tokens (alice.txt), unique integers,
/// and Zipf-distributed integers (heavily repeated keys).
fn bench_hashmap_throughput(c: &mut Criterion) {
    let plot_config = PlotConfiguration::default().summary_scale(AxisScale::Linear);

    let mut group = c.benchmark_group("CreateHashMap");
    group.plot_config(plot_config);

    // Natural-language input: short, frequently repeated tokens.
    let input_name = "alice";
    let input_bytes = ALICE.len() as u64;
    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
        BenchmarkId::new(input_name.to_string(), input_bytes),
        &ALICE,
        |b, i| b.iter(|| create_hash_map(i.split_whitespace().map(|el| el.as_bytes()))),
    );

    // Unique integers: 1M distinct 8-byte keys — stresses the insert path.
    // Fix: the original wrote `1_000_000 * 8 as u64`, where `as` binds only
    // to the literal `8`; type the first factor instead for clarity.
    let input_bytes = 1_000_000u64 * 8;
    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
        BenchmarkId::new("numbers".to_string(), input_bytes),
        &(0..1_000_000u64),
        |b, i| b.iter(|| create_hash_map(i.clone().map(|el| el.to_le_bytes()))),
    );

    // Zipf-distributed integers: skewed key frequencies — stresses the
    // "key already present" update path.
    use rand::distributions::Distribution;
    use rand::rngs::StdRng;
    // Fixed seed so every benchmark run sees the same key sequence.
    let mut rng = StdRng::from_seed([3u8; 32]);
    let zipf = zipf::ZipfDistribution::new(10_000, 1.03).unwrap();

    let input_bytes = 1_000_000u64 * 8;
    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
        BenchmarkId::new("numbers_zipf".to_string(), input_bytes),
        &(0..1_000_000u64),
        |b, i| b.iter(|| create_hash_map(i.clone().map(|_el| zipf.sample(&mut rng).to_le_bytes()))),
    );

    group.finish();
}

/// Builds an `ArenaHashMap` counting how often each byte-term occurs.
///
/// Fix: the original declared a lifetime parameter `<'a>` that is never
/// used anywhere in the signature or body (flagged by `unused_lifetimes`);
/// it is removed here. Callers are unaffected since lifetimes cannot be
/// supplied explicitly at the call site.
fn create_hash_map<T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
    // Tiny initial capacity: forces the map through its resize path,
    // which is part of what the benchmark measures.
    let mut map = ArenaHashMap::with_capacity(4);
    for term in terms {
        // Insert the term with count 1, or bump the existing count.
        map.mutate_or_create(term.as_ref(), |val| {
            if let Some(mut val) = val {
                val += 1;
                val
            } else {
                1u64
            }
        });
    }

    map
}

// Register the benchmark group and let Criterion generate the `main` entry point.
criterion_group!(block_benches, bench_hashmap_throughput,);
criterion_main!(block_benches);
27 changes: 27 additions & 0 deletions stacker/example/hashmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
use tantivy_stacker::ArenaHashMap;

const ALICE: &str = include_str!("../../benches/alice.txt");

/// Exercises `ArenaHashMap` for profiling: one huge build with unique
/// numeric keys, then many rebuilds from natural-language tokens.
fn main() {
    // 100M distinct keys, stringified — stresses growth/resize behavior.
    create_hash_map((0..100_000_000).map(|n| n.to_string()));

    // Rebuild a small map repeatedly from alice.txt tokens — stresses
    // the repeated-key update path.
    for _iteration in 0..1000 {
        create_hash_map(ALICE.split_whitespace());
    }
}

/// Builds an `ArenaHashMap` counting how often each string term occurs.
///
/// Fix: the original declared a lifetime parameter `<'a>` that is never
/// used in the signature or body (flagged by `unused_lifetimes`); it is
/// removed here. Callers are unaffected since lifetimes cannot be
/// supplied explicitly at the call site.
fn create_hash_map<T: AsRef<str>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
    // Tiny initial capacity so the map exercises its resize path.
    let mut map = ArenaHashMap::with_capacity(4);
    for term in terms {
        // Insert the term with count 1, or bump the existing count.
        map.mutate_or_create(term.as_ref().as_bytes(), |val| {
            if let Some(mut val) = val {
                val += 1;
                val
            } else {
                1u64
            }
        });
    }

    map
}
Loading

0 comments on commit e83abbf

Please sign in to comment.