Skip to content

Commit 263b64f

Browse files
authored
Merge pull request #5 from Vagabond/adt/more-improvements
Further improvements
2 parents 9c0f014 + ed65adf commit 263b64f

File tree

10 files changed

+163
-141
lines changed

10 files changed

+163
-141
lines changed

crates/bloom/Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/bloom/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ crate-type = ["dylib"]
1111
[dependencies]
1212
rustler = ">=0.13.0"
1313
bloomfilter = "1"
14+
siphasher = "*"

crates/bloom/src/lib.rs

+86-20
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
extern crate bloomfilter;
22
#[macro_use]
33
extern crate rustler;
4+
extern crate siphasher;
45

56
use bloomfilter::Bloom;
6-
use rustler::{Encoder, Env, NifResult, Term, OwnedBinary, Binary};
77
use rustler::resource::ResourceArc;
8-
use std::sync::RwLock;
8+
use rustler::{Binary, Encoder, Env, NifResult, OwnedBinary, Term};
9+
use siphasher::sip::SipHasher13;
10+
use std::hash::Hash;
11+
use std::hash::Hasher;
912
use std::io::Write;
13+
use std::sync::RwLock;
1014

1115
mod atoms {
1216
rustler_atoms! {
@@ -15,7 +19,7 @@ mod atoms {
1519
}
1620

1721
struct FilterResource {
18-
filter: RwLock<Bloom <Vec<u8>>>
22+
filter: RwLock<Bloom<[u8]>>,
1923
}
2024

2125
rustler_export_nifs!(
@@ -26,7 +30,8 @@ rustler_export_nifs!(
2630
("serialize", 1, serialize),
2731
("deserialize", 7, deserialize),
2832
("set", 2, set),
29-
("check", 2, check),
33+
("check_nif", 2, check),
34+
("check_nif", 8, check_ro),
3035
("check_and_set", 2, check_and_set),
3136
("clear", 1, clear),
3237
],
@@ -43,9 +48,7 @@ fn new<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
4348
let items_count: i64 = args[1].decode()?;
4449

4550
let resource = ResourceArc::new(FilterResource {
46-
filter: RwLock::new(
47-
Bloom::new(bitmap_size as usize, items_count as usize)
48-
)
51+
filter: RwLock::new(Bloom::new(bitmap_size as usize, items_count as usize)),
4952
});
5053

5154
Ok((atoms::ok(), resource.encode(env)).encode(env))
@@ -56,15 +59,12 @@ fn new_for_fp_rate<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
5659
let fp_p: f64 = args[1].decode()?;
5760

5861
let resource = ResourceArc::new(FilterResource {
59-
filter: RwLock::new(
60-
Bloom::new_for_fp_rate(items_count as usize, fp_p)
61-
)
62+
filter: RwLock::new(Bloom::new_for_fp_rate(items_count as usize, fp_p)),
6263
});
6364

6465
Ok((atoms::ok(), resource).encode(env))
6566
}
6667

67-
6868
fn serialize<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
6969
let resource: ResourceArc<FilterResource> = args[0].decode()?;
7070

@@ -74,11 +74,17 @@ fn serialize<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
7474
let mut binary = OwnedBinary::new(bitmap.len()).unwrap();
7575
binary.as_mut_slice().write_all(&bitmap).unwrap();
7676

77-
Ok((atoms::ok(), (Binary::from_owned(binary, env),
78-
filter.number_of_bits(),
79-
filter.number_of_hash_functions(),
80-
sips[0],
81-
sips[1])).encode(env))
77+
Ok((
78+
atoms::ok(),
79+
(
80+
Binary::from_owned(binary, env),
81+
filter.number_of_bits(),
82+
filter.number_of_hash_functions(),
83+
sips[0],
84+
sips[1],
85+
),
86+
)
87+
.encode(env))
8288
}
8389

8490
fn deserialize<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
@@ -96,15 +102,19 @@ fn deserialize<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
96102
num_bits,
97103
num_funs,
98104
[(sip00, sip01), (sip10, sip11)],
99-
))
105+
)),
100106
});
101107

102108
Ok((atoms::ok(), resource).encode(env))
103109
}
104110

105111
fn set<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
106112
let resource: ResourceArc<FilterResource> = args[0].decode()?;
107-
let key: Vec<u8> = args[1].decode()?;
113+
let key: Binary = if args[1].is_binary() {
114+
args[1].decode()?
115+
} else {
116+
Binary::from_owned(args[1].to_binary(), env)
117+
};
108118

109119
let mut filter = resource.filter.write().unwrap();
110120
(*filter).set(&key);
@@ -114,16 +124,72 @@ fn set<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
114124

115125
fn check<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
116126
let resource: ResourceArc<FilterResource> = args[0].decode()?;
117-
let key: Vec<u8> = args[1].decode()?;
127+
let key: Binary = if args[1].is_binary() {
128+
args[1].decode()?
129+
} else {
130+
Binary::from_owned(args[1].to_binary(), env)
131+
};
118132

119133
let filter = resource.filter.read().unwrap();
120134

121135
Ok(filter.check(&key).encode(env))
122136
}
123137

138+
// check a serialized bloom for key membership without fully deserializing the bloom
139+
// specifically we want to avoid the very slow bitvec deserialization and simply compute
140+
// the hash keys manually and check them inside the Erlang binary by hand
141+
// for a 50 MB bloom, this reduces the time to check a serialized bloom from 25 seconds to 35 microseconds
142+
fn check_ro<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
143+
let bitmap: Binary = args[0].decode()?;
144+
let num_bits: u64 = args[1].decode()?;
145+
let num_funs: u32 = args[2].decode()?;
146+
let sip00: u64 = args[3].decode()?;
147+
let sip01: u64 = args[4].decode()?;
148+
let sip10: u64 = args[5].decode()?;
149+
let sip11: u64 = args[6].decode()?;
150+
let key: Binary = if args[7].is_binary() {
151+
args[7].decode()?
152+
} else {
153+
Binary::from_owned(args[7].to_binary(), env)
154+
};
155+
156+
let sips = [
157+
SipHasher13::new_with_keys(sip00, sip01),
158+
SipHasher13::new_with_keys(sip10, sip11),
159+
];
160+
161+
let mut hashes = [0u64, 0u64];
162+
for k_i in 0..num_funs {
163+
let bit_offset = (bloom_hash(&mut hashes, &key, k_i, &sips) % num_bits) as usize;
164+
let byte_offset = bit_offset / 8;
165+
let bit = 7 - (bit_offset % 8);
166+
if (bitmap[byte_offset] >> bit) & 1 != 1 {
167+
return Ok(false.encode(env));
168+
}
169+
}
170+
Ok(true.encode(env))
171+
}
172+
173+
// helper for check_ro, extracted from the bloomfilter crate source code
174+
fn bloom_hash(hashes: &mut [u64; 2], item: &[u8], k_i: u32, sips: &[SipHasher13; 2]) -> u64 {
175+
if k_i < 2 {
176+
let mut sip = sips[k_i as usize];
177+
item.hash(&mut sip);
178+
let hash = sip.finish();
179+
hashes[k_i as usize] = hash;
180+
hash
181+
} else {
182+
hashes[0].wrapping_add((u64::from(k_i)).wrapping_mul(hashes[1]) % 0xffff_ffff_ffff_ffc5)
183+
}
184+
}
185+
124186
fn check_and_set<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
125187
let resource: ResourceArc<FilterResource> = args[0].decode()?;
126-
let key: Vec<u8> = args[1].decode()?;
188+
let key: Binary = if args[1].is_binary() {
189+
args[1].decode()?
190+
} else {
191+
Binary::from_owned(args[1].to_binary(), env)
192+
};
127193

128194
let mut filter = resource.filter.write().unwrap();
129195

src/bloom.erl

+39-4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
serialize/1,
99
deserialize/1,
1010
deserialize/7,
11+
to_bin/1,
12+
from_bin/1,
1113
set/2,
1214
check/2,
1315
check_and_set/2,
@@ -18,6 +20,9 @@
1820
-export([load/0]).
1921
-on_load(load/0).
2022

23+
%% rev this if the internal structure of the bloom filter changes
24+
-define(ERBLOOM_VERSION1, 1).
25+
2126
-type sip_keys() :: {non_neg_integer(), non_neg_integer()}.
2227

2328
-type serialized_bloom() :: {Bitmap :: binary(), NumBits :: pos_integer(), NumFuns :: pos_integer(), sip_keys(), sip_keys()}.
@@ -36,11 +41,26 @@ new(_BitmapSize, _ItemsCount) ->
3641
new_for_fp_rate(_ItemsCount, _FP_Rate) ->
3742
not_loaded(?LINE).
3843

39-
%% @doc Serialize a bloom filter to Erlang terms.
44+
%% @doc Serialize a bloom filter to Erlang terms. `check/2' can be used against this serialized form efficiently.
4045
-spec serialize(Bloom :: bloom()) -> {ok, serialized_bloom()}.
4146
serialize(_Ref) ->
4247
not_loaded(?LINE).
4348

49+
%% @doc Serialize a bloom filter and encode it as a versioned binary. `check/2' can be used against this binary form efficiently.
50+
-spec to_bin(bloom()) -> binary().
51+
to_bin(Ref) ->
52+
{ok, {Bitmap,NumBits,NumFuns,{Sv00,Sv01},{Sv10,Sv11}}} = serialize(Ref),
53+
<<?ERBLOOM_VERSION1:8/integer, NumBits:64/integer-unsigned-little, NumFuns:32/integer-unsigned-little,
54+
Sv00:64/integer-unsigned-little, Sv01:64/integer-unsigned-little,
55+
Sv10:64/integer-unsigned-little, Sv11:64/integer-unsigned-little, Bitmap/binary>>.
56+
57+
%% @doc Deserialize a versioned binary into a bloom filter reference.
58+
-spec from_bin(binary()) -> bloom().
59+
from_bin(<<?ERBLOOM_VERSION1:8/integer, NumBits:64/integer-unsigned-little, NumFuns:32/integer-unsigned-little,
60+
Sv00:64/integer-unsigned-little, Sv01:64/integer-unsigned-little,
61+
Sv10:64/integer-unsigned-little, Sv11:64/integer-unsigned-little, Bitmap/binary>>) ->
62+
deserialize(Bitmap, NumBits, NumFuns, Sv00, Sv01, Sv10, Sv11).
63+
4464
%% @doc Deserialize a previously serialized bloom filter back into a bloom filter reference.
4565
-spec deserialize(serialized_bloom()) -> {ok, bloom()}.
4666
deserialize({Bitmap,NumBits,NumFuns,{Sv00,Sv01},{Sv10,Sv11}}) ->
@@ -56,9 +76,18 @@ set(_Ref, _Key) ->
5676
not_loaded(?LINE).
5777

5878
%% @doc Check for the presence of `Key' in `Bloom'.
59-
-spec check(Bloom :: bloom(), Key :: term()) -> boolean().
60-
check(_Ref, _Key) ->
61-
not_loaded(?LINE).
79+
%% Serialized and binary encoded bloom filters can be used with this
80+
%% function when you wish to check for the key and do not need to use set
81+
%% (e.g. a static bloom filter stored in a database).
82+
-spec check(bloom() | serialized_bloom() | binary(), term()) -> boolean().
83+
check(Bloom, Key) when is_reference(Bloom) ->
84+
check_nif(Bloom, Key);
85+
check(<<?ERBLOOM_VERSION1:8/integer, NumBits:64/integer-unsigned-little, NumFuns:32/integer-unsigned-little,
86+
Sv00:64/integer-unsigned-little, Sv01:64/integer-unsigned-little,
87+
Sv10:64/integer-unsigned-little, Sv11:64/integer-unsigned-little, Bitmap/binary>>, Key) ->
88+
check_nif(Bitmap, NumBits, NumFuns, Sv00, Sv01, Sv10, Sv11, Key);
89+
check({Bitmap,NumBits,NumFuns,{Sv00,Sv01},{Sv10,Sv11}}, Key) ->
90+
check_nif(Bitmap, NumBits, NumFuns, Sv00, Sv01, Sv10, Sv11, Key).
6291

6392
%% @doc Record the presence of `Key' in `Bloom' and return whether it was present before.
6493
-spec check_and_set(Bloom :: bloom(), Key :: term()) -> boolean().
@@ -70,6 +99,12 @@ check_and_set(_Ref, _Key) ->
7099
clear(_Ref) ->
71100
not_loaded(?LINE).
72101

102+
check_nif(_Ref, _Key) ->
103+
not_loaded(?LINE).
104+
105+
check_nif(_Bitmap, _NumBits, _NumFuns, _Sv00, _Sv01, _Sv10, _Sv11, _Key) ->
106+
not_loaded(?LINE).
107+
73108
%% @private
74109
load() ->
75110
erlang:load_nif(filename:join(priv(), "libbloom"), none).

src/erbloom.app.src

-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
[{description, "Fast Bloom Filter"},
33
{vsn, "0.1.0"},
44
{registered, []},
5-
{mod, { erbloom_app, []}},
65
{applications,
76
[kernel,
87
stdlib

src/erbloom_app.erl

-26
This file was deleted.

src/erbloom_sup.erl

-43
This file was deleted.

0 commit comments

Comments
 (0)