Skip to content

Commit

Permalink
perf: faster term hash map (#1940)
Browse files Browse the repository at this point in the history
* add term hashmap benchmark

* refactor arena hashmap

add inlines
remove occupied array and use table_entry.is_empty instead (saves 4 bytes per entry)
raise saturation threshold from 1/3 to 1/2 to reduce memory (the table is allowed to fill up more before resizing)
use u32 for UnorderedId (we have the 4billion limit anyways on the Columnar stuff)
fix naming LinearProbing
remove byteorder dependency

memory consumption went down from 2 GB to 1.8 GB on indexing wikipedia dataset in tantivy

* Update stacker/src/arena_hashmap.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
  • Loading branch information
PSeitz and fulmicoton authored Apr 17, 2023
1 parent 780e263 commit e83abbf
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 53 deletions.
6 changes: 3 additions & 3 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::{
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, Schema, Term, Value};
Expand All @@ -26,7 +26,7 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
let table_memory_upper_bound = per_thread_memory_budget / 3;
(10..20) // We cap it at 2^19 = 512K capacity.
.map(|power| 1 << power)
.take_while(|capacity| compute_table_size(*capacity) < table_memory_upper_bound)
.take_while(|capacity| compute_table_memory_size(*capacity) < table_memory_upper_bound)
.last()
.ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!(
Expand Down Expand Up @@ -455,7 +455,7 @@ mod tests {
/// Checks that `compute_initial_table_size` scales the initial term-hashmap
/// capacity with the per-thread memory budget and caps it at 2^19 entries.
///
/// Fix: the scraped diff retained both the pre-image (`1 << 17`) and
/// post-image (`1 << 18`) assertion for the 10 MB budget, which would make
/// the test assert two different values for the same input; only the
/// post-commit expectation is kept.
fn test_hashmap_size() {
    assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
    assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
    // Was `1 << 18` after this commit (previously `1 << 17`): the higher
    // saturation threshold lets a larger table fit the same budget.
    assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 18);
    assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
    assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
}
Expand Down
2 changes: 1 addition & 1 deletion src/postings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mod serializer;
mod skip;
mod term_info;

pub(crate) use stacker::compute_table_size;
pub(crate) use stacker::compute_table_memory_size;

pub use self::block_segment_postings::BlockSegmentPostings;
pub(crate) use self::indexing_context::IndexingContext;
Expand Down
16 changes: 15 additions & 1 deletion stacker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,19 @@ license = "MIT"

[dependencies]
murmurhash32 = "0.3"
byteorder = "1"
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
criterion = "0.4.0"

[[bench]]
harness = false
name = "crit_bench"
path = "benches/crit_bench.rs"

[[example]]
name = "hashmap"
path = "example/hashmap.rs"

[dev-dependencies]
rand = "0.8.5"
zipf = "7.0.0"

70 changes: 70 additions & 0 deletions stacker/benches/crit_bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#![allow(dead_code)]
extern crate criterion;

use criterion::*;
use rand::SeedableRng;
use tantivy_stacker::ArenaHashMap;

const ALICE: &str = include_str!("../../benches/alice.txt");

/// Benchmarks `ArenaHashMap` construction throughput over three input
/// distributions: natural-language tokens (alice.txt), unique integers,
/// and Zipf-distributed integers (heavily repeated keys).
fn bench_hashmap_throughput(c: &mut Criterion) {
    let plot_config = PlotConfiguration::default().summary_scale(AxisScale::Linear);

    let mut group = c.benchmark_group("CreateHashMap");
    group.plot_config(plot_config);

    // Natural-language input: short, frequently repeated tokens.
    let input_name = "alice";
    let input_bytes = ALICE.len() as u64;
    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
        BenchmarkId::new(input_name.to_string(), input_bytes),
        &ALICE,
        |b, i| b.iter(|| create_hash_map(i.split_whitespace().map(|el| el.as_bytes()))),
    );

    // Unique integers: 1M distinct 8-byte keys — stresses the insert path.
    // Fix: the original wrote `1_000_000 * 8 as u64`, where `as` binds only
    // to the literal `8`; type the first factor instead for clarity.
    let input_bytes = 1_000_000u64 * 8;
    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
        BenchmarkId::new("numbers".to_string(), input_bytes),
        &(0..1_000_000u64),
        |b, i| b.iter(|| create_hash_map(i.clone().map(|el| el.to_le_bytes()))),
    );

    // Zipf-distributed integers: skewed key frequencies — stresses the
    // "key already present" update path.
    use rand::distributions::Distribution;
    use rand::rngs::StdRng;
    // Fixed seed so every benchmark run sees the same key sequence.
    let mut rng = StdRng::from_seed([3u8; 32]);
    let zipf = zipf::ZipfDistribution::new(10_000, 1.03).unwrap();

    let input_bytes = 1_000_000u64 * 8;
    group.throughput(Throughput::Bytes(input_bytes));

    group.bench_with_input(
        BenchmarkId::new("numbers_zipf".to_string(), input_bytes),
        &(0..1_000_000u64),
        |b, i| b.iter(|| create_hash_map(i.clone().map(|_el| zipf.sample(&mut rng).to_le_bytes()))),
    );

    group.finish();
}

/// Builds an `ArenaHashMap` counting how often each byte-term occurs.
///
/// Fix: the original declared a lifetime parameter `<'a>` that is never
/// used anywhere in the signature or body (flagged by `unused_lifetimes`);
/// it is removed here. Callers are unaffected since lifetimes cannot be
/// supplied explicitly at the call site.
fn create_hash_map<T: AsRef<[u8]>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
    // Tiny initial capacity: forces the map through its resize path,
    // which is part of what the benchmark measures.
    let mut map = ArenaHashMap::with_capacity(4);
    for term in terms {
        // Insert the term with count 1, or bump the existing count.
        map.mutate_or_create(term.as_ref(), |val| {
            if let Some(mut val) = val {
                val += 1;
                val
            } else {
                1u64
            }
        });
    }

    map
}

// Register the benchmark group and let Criterion generate the `main` entry point.
criterion_group!(block_benches, bench_hashmap_throughput,);
criterion_main!(block_benches);
27 changes: 27 additions & 0 deletions stacker/example/hashmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
use tantivy_stacker::ArenaHashMap;

const ALICE: &str = include_str!("../../benches/alice.txt");

/// Exercises `ArenaHashMap` for profiling: one huge build with unique
/// numeric keys, then many rebuilds from natural-language tokens.
fn main() {
    // 100M distinct keys, stringified — stresses growth/resize behavior.
    create_hash_map((0..100_000_000).map(|n| n.to_string()));

    // Rebuild a small map repeatedly from alice.txt tokens — stresses
    // the repeated-key update path.
    for _iteration in 0..1000 {
        create_hash_map(ALICE.split_whitespace());
    }
}

/// Builds an `ArenaHashMap` counting how often each string term occurs.
///
/// Fix: the original declared a lifetime parameter `<'a>` that is never
/// used in the signature or body (flagged by `unused_lifetimes`); it is
/// removed here. Callers are unaffected since lifetimes cannot be
/// supplied explicitly at the call site.
fn create_hash_map<T: AsRef<str>>(terms: impl Iterator<Item = T>) -> ArenaHashMap {
    // Tiny initial capacity so the map exercises its resize path.
    let mut map = ArenaHashMap::with_capacity(4);
    for term in terms {
        // Insert the term with count 1, or bump the existing count.
        map.mutate_or_create(term.as_ref().as_bytes(), |val| {
            if let Some(mut val) = val {
                val += 1;
                val
            } else {
                1u64
            }
        });
    }

    map
}
Loading

0 comments on commit e83abbf

Please sign in to comment.