···11+The MIT License
22+33+Copyright (c) 2019 Clément Pascutto
44+55+Permission is hereby granted, free of charge, to any person obtaining a copy
66+of this software and associated documentation files (the "Software"), to deal
77+in the Software without restriction, including without limitation the rights
88+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
99+copies of the Software, and to permit persons to whom the Software is
1010+furnished to do so, subject to the following conditions:
1111+1212+The above copyright notice and this permission notice shall be included in
1313+all copies or substantial portions of the Software.
1414+1515+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1616+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1717+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2121+THE SOFTWARE.
···11+# Bloomf - Efficient Bloom filters for OCaml [](https://ci.ocamllabs.io/github/mirage/bloomf)
22+Bloom filters are memory and time efficient data structures allowing
33+probabilistic membership queries in a set.
44+55+A query negative result ensures that the element is not present in the set,
66+while a positive result might be a false positive, i.e. the element might not be
77+present and the BF membership query can return true anyway.
88+99+Internal parameters of the BF allow to control its false positive rate depending
1010+on the expected number of elements in it.
1111+1212+Online documentation is available [here](https://mirage.github.io/bloomf/).
1313+1414+## Install
1515+1616+The latest version of `bloomf` is available on opam with `opam install bloomf`.
1717+1818+Alternatively, you can build from sources with `make` or `dune build`.
1919+2020+## Tests
2121+2222+Some of the tests, measuring false positive rate or size estimation, might fail
2323+once in a while since they are randomized. They are thus removed from `dune
2424+runtest` alias.
2525+2626+To run the whole test suite, run `dune build @runtest-rand` instead.
2727+2828+## Benchmarks
2929+3030+Micro benchmarks are provided for `create`, `add`, `mem` and `size_estimate`
3131+operations. Expected error rate is 0.01.
3232+3333+They preform OLS regression analysis using the development version of
3434+[bechamel](https://github.com/dinosaure/bechamel). To reproduce them, pin
3535+`bechamel` then run `dune build @bench`.
+32
bloom.opam
···11+# This file is generated by dune, edit dune-project instead
22+opam-version: "2.0"
33+synopsis: "Bloom filters for OCaml"
44+description:
55+ "Bloom filters are probabilistic data structures for membership testing. A positive result means the element may be in the set, while a negative result means it is definitely not."
66+maintainer: ["Thomas Gazagnaire"]
77+authors: ["Clément Pascutto" "Thomas Gazagnaire"]
88+homepage: "https://tangled.org/gazagnaire.org/ocaml-bloom"
99+bug-reports: "https://tangled.org/gazagnaire.org/ocaml-bloom/issues"
1010+depends: [
1111+ "dune" {>= "3.0"}
1212+ "ocaml" {>= "5.1"}
1313+ "bitv" {>= "1.4"}
1414+ "alcotest" {>= "1.0" & with-test}
1515+ "crowbar" {>= "0.2" & with-test}
1616+ "odoc" {with-doc}
1717+]
1818+build: [
1919+ ["dune" "subst"] {dev}
2020+ [
2121+ "dune"
2222+ "build"
2323+ "-p"
2424+ name
2525+ "-j"
2626+ jobs
2727+ "@install"
2828+ "@runtest" {with-test}
2929+ "@doc" {with-doc}
3030+ ]
3131+]
3232+dev-repo: "https://tangled.org/gazagnaire.org/ocaml-bloom"
+22
dune-project
···11+(lang dune 3.0)
22+(name bloom)
33+44+(generate_opam_files true)
55+(implicit_transitive_deps false)
66+77+(source (uri https://tangled.org/gazagnaire.org/ocaml-bloom))
88+(bug_reports https://tangled.org/gazagnaire.org/ocaml-bloom/issues)
99+(homepage https://tangled.org/gazagnaire.org/ocaml-bloom)
1010+1111+(maintainers "Thomas Gazagnaire")
1212+(authors "Clément Pascutto" "Thomas Gazagnaire")
1313+1414+(package
1515+ (name bloom)
1616+ (synopsis "Bloom filters for OCaml")
1717+ (description "Bloom filters are probabilistic data structures for membership testing. A positive result means the element may be in the set, while a negative result means it is definitely not.")
1818+ (depends
1919+ (ocaml (>= 5.1))
2020+ (bitv (>= 1.4))
2121+ (alcotest (and (>= 1.0) :with-test))
2222+ (crowbar (and (>= 0.2) :with-test))))
···11+(** Main fuzz test entry point. *)
22+33+let () =
44+ Fuzz_common.run ();
55+ Fuzz_bloom.run ()
+108
fuzz/fuzz_bloom.ml
···11+(** Fuzz tests for Bloom filter module. *)
22+33+open Crowbar
44+55+(** No false negatives - after adding an element, mem must return true. *)
66+let test_no_false_negatives seed elts =
77+ let size = max 100 (String.length seed * 10) in
88+ let bf = Bloom.create ~error_rate:0.01 size in
99+ List.iter (fun e -> Bloom.add bf e) elts;
1010+ List.iter
1111+ (fun e -> if not (Bloom.mem bf e) then fail "false negative detected")
1212+ elts
1313+1414+(** Serialization roundtrip - valid filters must round-trip. *)
1515+let test_serialization_roundtrip elts =
1616+ let bf = Bloom.create ~error_rate:0.01 1000 in
1717+ List.iter (fun e -> Bloom.add bf e) elts;
1818+ match Bloom.to_bytes bf |> Bloom.of_bytes with
1919+ | Error _ -> fail "deserialization failed"
2020+ | Ok bf2 ->
2121+ List.iter
2222+ (fun e ->
2323+ if not (Bloom.mem bf2 e) then fail "element lost after roundtrip")
2424+ elts
2525+2626+(** Union contains all elements from both filters. *)
2727+let test_union elts1 elts2 =
2828+ let bf1 = Bloom.create ~error_rate:0.01 1000 in
2929+ let bf2 = Bloom.create ~error_rate:0.01 1000 in
3030+ List.iter (fun e -> Bloom.add bf1 e) elts1;
3131+ List.iter (fun e -> Bloom.add bf2 e) elts2;
3232+ let bf_union = Bloom.union bf1 bf2 in
3333+ List.iter
3434+ (fun e ->
3535+ if not (Bloom.mem bf_union e) then fail "element missing from union")
3636+ elts1;
3737+ List.iter
3838+ (fun e ->
3939+ if not (Bloom.mem bf_union e) then fail "element missing from union")
4040+ elts2
4141+4242+(** Copy creates independent filter - original elements in both. *)
4343+let test_copy_independence elts extra =
4444+ let bf = Bloom.create ~error_rate:0.01 1000 in
4545+ List.iter (fun e -> Bloom.add bf e) elts;
4646+ let bf_copy = Bloom.copy bf in
4747+ Bloom.add bf extra;
4848+ List.iter
4949+ (fun e ->
5050+ if not (Bloom.mem bf e) then fail "element missing from original";
5151+ if not (Bloom.mem bf_copy e) then fail "element missing from copy")
5252+ elts
5353+5454+(** Clear empties filter - size estimate should be near 0. *)
5555+let test_clear elts =
5656+ let bf = Bloom.create ~error_rate:0.01 1000 in
5757+ List.iter (fun e -> Bloom.add bf e) elts;
5858+ Bloom.clear bf;
5959+ let estimate = Bloom.size_estimate bf in
6060+ if estimate >= 10 then fail "filter not cleared properly"
6161+6262+(** Intersection preserves common elements. *)
6363+let test_intersection elts =
6464+ let bf1 = Bloom.create ~error_rate:0.01 1000 in
6565+ let bf2 = Bloom.create ~error_rate:0.01 1000 in
6666+ (* Add same elements to both *)
6767+ List.iter (fun e -> Bloom.add bf1 e) elts;
6868+ List.iter (fun e -> Bloom.add bf2 e) elts;
6969+ let bf_inter = Bloom.inter bf1 bf2 in
7070+ (* All elements should be in intersection *)
7171+ List.iter
7272+ (fun e ->
7373+ if not (Bloom.mem bf_inter e) then
7474+ fail "element missing from intersection")
7575+ elts
7676+7777+(** Create with invalid error_rate should fail. *)
7878+let test_invalid_error_rate () =
7979+ (try
8080+ let _ = Bloom.create ~error_rate:0.0 100 in
8181+ fail "should reject error_rate=0"
8282+ with Invalid_argument _ -> ());
8383+ (try
8484+ let _ = Bloom.create ~error_rate:1.0 100 in
8585+ fail "should reject error_rate=1"
8686+ with Invalid_argument _ -> ());
8787+ (try
8888+ let _ = Bloom.create ~error_rate:(-0.1) 100 in
8989+ fail "should reject negative error_rate"
9090+ with Invalid_argument _ -> ());
9191+ (try
9292+ let _ = Bloom.create ~error_rate:1.5 100 in
9393+ fail "should reject error_rate>1"
9494+ with Invalid_argument _ -> ())
9595+9696+let run () =
9797+ add_test ~name:"bloom: no false negatives" [ bytes; list bytes ]
9898+ test_no_false_negatives;
9999+ add_test ~name:"bloom: serialization roundtrip" [ list bytes ]
100100+ test_serialization_roundtrip;
101101+ add_test ~name:"bloom: union contains both" [ list bytes; list bytes ]
102102+ test_union;
103103+ add_test ~name:"bloom: copy independence" [ list bytes; bytes ]
104104+ test_copy_independence;
105105+ add_test ~name:"bloom: clear empties filter" [ list bytes ] test_clear;
106106+ add_test ~name:"bloom: intersection preserves common" [ list bytes ]
107107+ test_intersection;
108108+ add_test ~name:"bloom: invalid error_rate" [ const () ] test_invalid_error_rate
+12
fuzz/fuzz_common.ml
···11+(** Common utilities for fuzz tests. *)
22+33+let to_bytes buf =
44+ let len = String.length buf in
55+ let b = Bytes.create len in
66+ Bytes.blit_string buf 0 b 0 len;
77+ b
88+99+let truncate ?(max_len = 4096) buf =
1010+ if String.length buf > max_len then String.sub buf 0 max_len else buf
1111+1212+let run () = ()
···11+type priv = { m : int; k : int; p_len : (int * int) list; b : Bitv.t }
22+type 'a t = priv
33+44+let copy t = { m = t.m; k = t.k; p_len = t.p_len; b = Bitv.copy t.b }
55+let rec gcd a b = if b = 0 then a else gcd b (a mod b)
66+77+let partition_lengths m k =
88+ let rec aux sum acc i =
99+ if List.length acc = k then (sum, acc)
1010+ else
1111+ let rec loop step =
1212+ let k = i + step in
1313+ let gcd_k = gcd k in
1414+ if List.for_all (fun p -> gcd_k p = 1) acc then
1515+ aux (sum + k) (k :: acc) (k + 1)
1616+ else loop (step + 1)
1717+ in
1818+ loop 1
1919+ in
2020+ aux 0 [] (m / k)
2121+2222+let v m k =
2323+ let m, lengths = partition_lengths m k in
2424+ let p_len =
2525+ let rec aux acc off = function
2626+ | [] -> acc
2727+ | h :: t -> aux ((off, h) :: acc) (off + h) t
2828+ in
2929+ aux [] 0 lengths
3030+ in
3131+ try
3232+ let b = Bitv.create m false in
3333+ { m; k; p_len; b }
3434+ with Invalid_argument _ -> invalid_arg "Bloomf.create"
3535+3636+let estimate_parameters n p =
3737+ let log2 = log 2. in
3838+ let nf = float_of_int n in
3939+ let m = ceil (-.nf *. log p /. log (2. ** log2)) in
4040+ let k = ceil (log2 *. m /. nf) in
4141+ (m, k)
4242+4343+let create ?(error_rate = 0.01) n_items =
4444+ let m, k = estimate_parameters n_items error_rate in
4545+ if error_rate <= 0. || error_rate >= 1. then invalid_arg "Bloomf.create";
4646+ v (int_of_float m) (int_of_float k)
4747+4848+let add_priv t hashed_data =
4949+ let rec loop = function
5050+ | [] -> ()
5151+ | (off, len) :: tl ->
5252+ let loc = off + (hashed_data mod len) in
5353+ let () = Bitv.unsafe_set t.b loc true in
5454+ loop tl
5555+ in
5656+ loop t.p_len
5757+5858+let add bf data = add_priv bf (Hashtbl.hash data)
5959+6060+let op f bf1 bf2 =
6161+ if bf1.k <> bf2.k || bf1.m <> bf2.m then
6262+ invalid_arg "incompatible bloom filters";
6363+ { m = bf1.m; k = bf2.k; p_len = bf1.p_len; b = f bf1.b bf2.b }
6464+6565+let union bf1 bf2 = op Bitv.bw_or bf1 bf2
6666+let inter bf1 bf2 = op Bitv.bw_and bf1 bf2
6767+6868+let mem_priv t hashed_data =
6969+ let rec loop = function
7070+ | [] -> true
7171+ | (off, len) :: tl ->
7272+ let loc = off + (hashed_data mod len) in
7373+ let res = Bitv.unsafe_get t.b loc in
7474+ if res then loop tl else false
7575+ in
7676+ loop t.p_len
7777+7878+let mem bf data = mem_priv bf (Hashtbl.hash data)
7979+let clear t = Bitv.fill t.b 0 t.m false
8080+8181+(* Bitv.pop is really slow *)
8282+let size_estimate t =
8383+ let mf = float_of_int t.m in
8484+ let kf = float_of_int t.k in
8585+ let xf = float_of_int (Bitv.pop t.b) in
8686+ int_of_float (-.mf /. kf *. log (1. -. (xf /. mf)))
8787+8888+(* Serialisers *)
8989+9090+external set_64 : bytes -> int -> int64 -> unit = "%caml_string_set64u"
9191+external swap64 : int64 -> int64 = "%bswap_int64"
9292+9393+let set_uint64 buf off v =
9494+ if not Sys.big_endian then set_64 buf off (swap64 v) else set_64 buf off v
9595+9696+(* type priv = { m : int; k : int; p_len : (int * int) list; b : Bitv.t } *)
9797+9898+let to_bytes t =
9999+ let enc_b = Bitv.to_bytes t.b in
100100+ let enc_b_len = Bytes.length enc_b in
101101+ let enc_p_len_len = 16 * List.length t.p_len in
102102+ let len = 8 + 8 + 8 + enc_p_len_len + enc_b_len in
103103+ let buf = Bytes.create len in
104104+ set_uint64 buf 0 (Int64.of_int t.m);
105105+ set_uint64 buf 8 (Int64.of_int t.k);
106106+ set_uint64 buf 16 (Int64.of_int (List.length t.p_len));
107107+ List.iteri
108108+ (fun i (i1, i2) ->
109109+ set_uint64 buf (24 + (8 * (2 * i))) (Int64.of_int i1);
110110+ set_uint64 buf (24 + (8 * ((2 * i) + 1))) (Int64.of_int i2))
111111+ t.p_len;
112112+ Bytes.blit enc_b 0 buf (24 + enc_p_len_len) enc_b_len;
113113+ buf
114114+115115+external get_64 : bytes -> int -> int64 = "%caml_string_get64"
116116+117117+let get_uint64 buf off =
118118+ if not Sys.big_endian then swap64 (get_64 buf off) else get_64 buf off
119119+120120+let of_bytes buf =
121121+ try
122122+ let m = get_uint64 buf 0 |> Int64.to_int in
123123+ let k = get_uint64 buf 8 |> Int64.to_int in
124124+ let p_len_len = get_uint64 buf 16 |> Int64.to_int in
125125+ let p_len =
126126+ List.init p_len_len (fun i ->
127127+ let i1 = get_uint64 buf (24 + (8 * (2 * i))) |> Int64.to_int in
128128+ let i2 = get_uint64 buf (24 + (8 * ((2 * i) + 1))) |> Int64.to_int in
129129+ (i1, i2))
130130+ in
131131+ let read = 24 + (16 * p_len_len) in
132132+ let b = Bytes.sub buf read (Bytes.length buf - read) |> Bitv.of_bytes in
133133+ Ok { m; k; p_len; b }
134134+ with _ -> Error (`Msg "invalid serialisation format")
135135+136136+module type Hashable = sig
137137+ type t
138138+139139+ val hash : t -> int
140140+end
141141+142142+module Make (H : Hashable) = struct
143143+ type t = priv
144144+145145+ let create = create
146146+ let copy = copy
147147+ let add bf data = add_priv bf (H.hash data)
148148+ let mem bf data = mem_priv bf (H.hash data)
149149+ let clear = clear
150150+ let size_estimate = size_estimate
151151+ let to_bytes = to_bytes
152152+ let of_bytes = of_bytes
153153+end
+93
src/bloom.mli
···11+(** Bloom filters
22+33+ Bloom is an implementation of Bloom filters in OCaml.
44+55+ Bloom filters are memory and time efficient data structures allowing
66+ probabilistic membership queries in a set. A query negative result ensures
77+ that the element is not present in the set, while a positive result might be
88+ a false positive, i.e. the element might not be present and the BF
99+ membership query can return true anyway. Internal parameters of the BF allow
1010+ to control its false positive rate depending on the expected number of
1111+ elements in it. *)
1212+1313+(** {1 Generic interface} *)
1414+1515+type 'a t
1616+(** The type of the Bloom filter. *)
1717+1818+val create : ?error_rate:float -> int -> 'a t
1919+(** [create ~error_rate size] creates a fresh BF for which expected false
2020+ positive rate when filled with [size] elements is [error_rate].
2121+2222+ @raise Invalid_argument
2323+ if [error_rate] is not in \]0, 1\[, or [size] is negative. *)
2424+2525+val copy : 'a t -> 'a t
2626+(** [copy t] copies the BF. The fresh returned BF is completely separated from
2727+ the given one. *)
2828+2929+val add : 'a t -> 'a -> unit
3030+(** [add t e] adds [e] to [t]. *)
3131+3232+val mem : 'a t -> 'a -> bool
3333+(** [mem t e] is [true] if [e] is in [t]. *)
3434+3535+val clear : 'a t -> unit
3636+(** [clear t] clears the contents of [t]. *)
3737+3838+val union : 'a t -> 'a t -> 'a t
3939+(** [union t1 t2] computes the union of the two inputs. This operation is
4040+ lossless in the sense that the resulting Bloom filter is the same as the
4141+ Bloom filter created from scratch using the union of the two sets.
4242+4343+ Raises [Invalid_argument] if the two bloom filters were created with
4444+ different parameters *)
4545+4646+val inter : 'a t -> 'a t -> 'a t
4747+(** [inter t1 t2] computes the intersection of the two inputs. The false
4848+ positive probability in the resulting Bloom filter is at most the
4949+ false-positive probability in one of the constituent Bloom filters, but may
5050+ be larger than the false positive probability in the Bloom filter created
5151+ from scratch using the intersection of the two sets.
5252+5353+ Raises [Invalid_argument] if the two bloom filters were created with
5454+ different parameters *)
5555+5656+val size_estimate : 'a t -> int
5757+(** [size_estimate t] is an approximation of the number of elements stored in
5858+ the bloom filter. Please note that this operation is costly (see
5959+ benchmarks). *)
6060+6161+(** {2 Serializers/Deserializers} *)
6262+6363+val to_bytes : 'a t -> bytes
6464+val of_bytes : bytes -> ('a t, [ `Msg of string ]) result
6565+6666+(** {1 Functorial interface} *)
6767+6868+(** The functorial interface allows you to specify your own hash function. *)
6969+7070+(** The input interface for [Bloom.Make]. *)
7171+module type Hashable = sig
7272+ type t
7373+ (** The type of the values to be stored. *)
7474+7575+ val hash : t -> int
7676+ (** The hash function. {e This function must return positive integers.}
7777+ Behavior is undefined otherwise. Please note that false positive rate
7878+ might be affected by unevenly distributed hash functions. *)
7979+end
8080+8181+(** The output interface for [Bloom.Make]. *)
8282+module Make (H : Hashable) : sig
8383+ type t
8484+8585+ val create : ?error_rate:float -> int -> t
8686+ val copy : t -> t
8787+ val add : t -> H.t -> unit
8888+ val mem : t -> H.t -> bool
8989+ val clear : t -> unit
9090+ val size_estimate : t -> int
9191+ val to_bytes : t -> bytes
9292+ val of_bytes : bytes -> (t, [ `Msg of string ]) result
9393+end