Persistent store with Git semantics: lazy reads, delayed writes, content-addressing
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

irmin: fold merkle-sync into irmin/lib/sync.ml

Sync.S module type (discover/locate/fetch/push) + default implementations:
- exchange: anti-entropy gossip for branch head dissemination
- merkle_diff: DAG descent for block transfer
- Bloom: probabilistic set membership submodule
- dune fmt

+200 -10
+136
lib/sync.ml
··· 1 + (** Sync backend interface + default implementations. *) 2 + 1 3 module type S = sig 2 4 type t 3 5 type hash ··· 13 15 hash -> 14 16 (unit, [ `Msg of string ]) result 15 17 end 18 + 19 + (** {1 Anti-entropy gossip} 20 + 21 + Exchange branch head vectors between peers. Pure, stateless. Ref: Demers et 22 + al., "Epidemic Algorithms for Replicated Database Maintenance", PODC 1987. 23 + *) 24 + 25 + type 'hash heads = (string * 'hash) list 26 + 27 + type 'hash gossip_action = 28 + | Up_to_date 29 + | Pull of string * 'hash 30 + | Push of string * 'hash 31 + | Diverged of string * 'hash * 'hash 32 + 33 + let exchange ~equal ~local ~remote ~is_ancestor = 34 + let all_branches = 35 + List.sort_uniq String.compare (List.map fst local @ List.map fst remote) 36 + in 37 + List.filter_map 38 + (fun branch -> 39 + let l = List.assoc_opt branch local in 40 + let r = List.assoc_opt branch remote in 41 + match (l, r) with 42 + | None, None -> None 43 + | None, Some rh -> Some (Pull (branch, rh)) 44 + | Some lh, None -> Some (Push (branch, lh)) 45 + | Some lh, Some rh -> 46 + if equal lh rh then None 47 + else if is_ancestor lh rh then Some (Pull (branch, rh)) 48 + else if is_ancestor rh lh then Some (Push (branch, lh)) 49 + else Some (Diverged (branch, lh, rh))) 50 + all_branches 51 + 52 + (** {1 Merkle descent} 53 + 54 + Compare root hashes, descend into differing subtrees, transfer the delta. 55 + Result is in dependency order: children before parents. *) 56 + 57 + let merkle_diff ~local_has ~remote_get root = 58 + let result = ref [] in 59 + let visited = Hashtbl.create 256 in 60 + let rec walk h = 61 + if Hashtbl.mem visited h || local_has h then () 62 + else ( 63 + Hashtbl.replace visited h (); 64 + match remote_get h with 65 + | None -> () 66 + | Some (children, data) -> 67 + List.iter walk children; 68 + result := (h, data) :: !result) 69 + in 70 + walk root; 71 + List.rev !result 72 + 73 + (** {1 Bloom filter} 74 + 75 + Probabilistic set membership. ~1% false positive rate with 10 bits per entry 76 + and 7 FNV-1a probes. *) 77 + 78 + module Bloom = struct 79 + let bits_per_entry = 10 80 + let num_probes = 7 81 + 82 + type t = { num_entries : int; bits : Bytes.t } 83 + 84 + let size n = max 8 ((n * bits_per_entry / 8) + 1) 85 + 86 + let fnv1a ~seed s = 87 + let h = ref (0x811c9dc5 lxor seed) in 88 + String.iter 89 + (fun c -> 90 + h := !h lxor Char.code c; 91 + h := !h * 0x01000193) 92 + s; 93 + !h land 0x3FFFFFFF 94 + 95 + let hash s probe total_bits = 96 + abs (fnv1a ~seed:(probe * 0x9e3779b9) s mod total_bits) 97 + 98 + let of_strings hashes = 99 + let n = List.length hashes in 100 + let sz = size n in 101 + let bits = Bytes.make sz '\000' in 102 + let total_bits = sz * 8 in 103 + List.iter 104 + (fun s -> 105 + for probe = 0 to num_probes - 1 do 106 + let bit = hash s probe total_bits in 107 + let byte_idx = bit / 8 in 108 + let bit_idx = bit mod 8 in 109 + let old = Char.code (Bytes.get bits byte_idx) in 110 + Bytes.set bits byte_idx (Char.chr (old lor (1 lsl bit_idx))) 111 + done) 112 + hashes; 113 + { num_entries = n; bits } 114 + 115 + let mem b s = 116 + if b.num_entries = 0 then false 117 + else 118 + let total_bits = Bytes.length b.bits * 8 in 119 + let rec check probe = 120 + if probe >= num_probes then true 121 + else 122 + let bit = hash s probe total_bits in 123 + let byte_idx = bit / 8 in 124 + let bit_idx = bit mod 8 in 125 + let v = Char.code (Bytes.get b.bits byte_idx) in 126 + if v land (1 lsl bit_idx) = 0 then false else check (probe + 1) 127 + in 128 + check 0 129 + 130 + let encode b = 131 + let buf = Buffer.create (4 + Bytes.length b.bits) in 132 + let n = b.num_entries in 133 + Buffer.add_char buf (Char.chr ((n lsr 24) land 0xFF)); 134 + Buffer.add_char buf (Char.chr ((n lsr 16) land 0xFF)); 135 + Buffer.add_char buf (Char.chr ((n lsr 8) land 0xFF)); 136 + Buffer.add_char buf (Char.chr (n land 0xFF)); 137 + Buffer.add_bytes buf b.bits; 138 + Buffer.contents buf 139 + 140 + let decode s = 141 + if String.length s < 4 then Error (`Msg "bloom too short") 142 + else 143 + let n = 144 + (Char.code s.[0] lsl 24) 145 + lor (Char.code s.[1] lsl 16) 146 + lor (Char.code s.[2] lsl 8) 147 + lor Char.code s.[3] 148 + in 149 + let bits = Bytes.of_string (String.sub s 4 (String.length s - 4)) in 150 + Ok { num_entries = n; bits } 151 + end
+64 -10
lib/sync.mli
··· 1 - (** Sync backend interface. 1 + (** Sync: backend interface + default implementations. 2 2 3 - Each backend provides its own implementation of the four sync operations. 4 - Irmin composes them: [discover] → [locate] → [fetch] → [Schema.merge] → 5 - (optionally) [push]. 3 + {b Backend interface}: {!S} defines the four sync operations. Each backend 4 + (Git, ATProto, generic) provides its own implementation. Irmin composes 5 + them: [discover] → [locate] → [fetch] → merge → [push]. 6 + 7 + {b Default implementations}: {!exchange} (anti-entropy gossip), 8 + {!merkle_diff} (DAG descent), {!Bloom} (probabilistic set membership). These 9 + are the building blocks for backends that don't have their own wire 10 + protocol. *) 6 11 7 - - Git: packfile negotiation, [git fetch] / [git push] 8 - - ATProto: CAR download/upload via XRPC 9 - - Local: direct heap access via [Git.Fetch.fetch_local] 10 - - Generic: [ocaml-merkle-sync] (gossip + merkle descent + bloom) *) 12 + (** {1 Backend interface} *) 11 13 12 14 module type S = sig 13 15 type t ··· 20 22 21 23 val locate : t -> local_has:(hash -> bool) -> hash -> hash list 22 24 (** [locate t ~local_has remote_head] returns hashes reachable from 23 - [remote_head] that are missing locally. Result is in dependency order: 24 - children before parents. *) 25 + [remote_head] that are missing locally. Dependency order. *) 25 26 26 27 val fetch : t -> hash list -> (hash * string) list 27 28 (** [fetch t hashes] retrieves blocks by hash from the remote. *) ··· 35 36 (** [push t blocks ~branch head] sends [blocks] to the remote and updates 36 37 [branch] to [head]. Fast-forward only. *) 37 38 end 39 + 40 + (** {1 Anti-entropy gossip} 41 + 42 + Exchange branch head vectors. Pure, stateless. Converges in O(log n) rounds 43 + for n nodes. Ref: Demers et al., PODC 1987. *) 44 + 45 + type 'hash heads = (string * 'hash) list 46 + (** Branch name → head hash. *) 47 + 48 + type 'hash gossip_action = 49 + | Up_to_date 50 + | Pull of string * 'hash 51 + | Push of string * 'hash 52 + | Diverged of string * 'hash * 'hash 53 + 54 + val exchange : 55 + equal:('hash -> 'hash -> bool) -> 56 + local:'hash heads -> 57 + remote:'hash heads -> 58 + is_ancestor:('hash -> 'hash -> bool) -> 59 + 'hash gossip_action list 60 + (** [exchange ~equal ~local ~remote ~is_ancestor] compares head vectors and 61 + returns actions to converge. *) 62 + 63 + (** {1 Merkle descent} 64 + 65 + Compare root hashes, descend into differing subtrees, transfer the delta. 66 + Result in dependency order: children before parents. *) 67 + 68 + val merkle_diff : 69 + local_has:('hash -> bool) -> 70 + remote_get:('hash -> ('hash list * string) option) -> 71 + 'hash -> 72 + ('hash * string) list 73 + (** [merkle_diff ~local_has ~remote_get root] returns blocks reachable from 74 + [root] that are missing locally. *) 75 + 76 + (** {1 Bloom filter} 77 + 78 + Probabilistic set membership. ~1% false positive rate. *) 79 + 80 + module Bloom : sig 81 + type t 82 + 83 + val of_strings : string list -> t 84 + (** Build a Bloom filter from string representations of hashes. *) 85 + 86 + val mem : t -> string -> bool 87 + (** [mem b s] is [true] if [s] is probably in the filter. *) 88 + 89 + val encode : t -> string 90 + val decode : string -> (t, [ `Msg of string ]) result 91 + end