···11defmodule MST do
22 @moduledoc """
33- Documentation for `MST`.
33+ AT Protocol-flavoured Merkle Search Tree (MST) for Elixir.
44+55+ An MST is a content-addressed, deterministic key/value tree where keys are
66+ byte arrays and values are `DASL.CID` links. The tree structure is fully
77+ determined by the current set of key/value pairs — equal content always
88+ produces the same root CID, making it suitable for Merkle proofs and
99+ efficient diffs.
1010+1111+ This library implements the AT Protocol MST specification but is designed to
1212+ be generic: it makes no assumptions about repository structure, commit
1313+ objects, or AT-URI paths.
1414+1515+ ## Quick start
1616+1717+ store = MST.Store.Memory.new()
1818+ tree = MST.new(store)
1919+2020+ val = DASL.CID.compute("my record")
2121+ {:ok, tree} = MST.put(tree, "collection/key", val)
2222+ {:ok, ^val} = MST.get(tree, "collection/key")
2323+2424+ {:ok, tree} = MST.delete(tree, "collection/key")
2525+2626+ ## Loading from a CAR file
2727+2828+ {:ok, tree} = MST.from_car(File.read!("repo.car"))
2929+ {:ok, binary} = MST.to_car(tree)
3030+3131+ ## Diffing two trees
3232+3333+ {:ok, diff} = MST.diff(tree_a, tree_b)
3434+ # diff.record_ops — sorted list of MST.Diff.Op structs
3535+3636+ ## Key depth
3737+3838+ The MST height of a key is derived by SHA-256 hashing it and counting
3939+ leading zero bits divided by 2 (floor), giving a fanout of 4.
4040+4141+ 0 = MST.key_height("2653ae71")
4242+ 1 = MST.key_height("blue")
4343+4444+ Spec: https://atproto.com/specs/repository#mst-structure
4545+ """
4646+4747+ alias MST.{CAR, Diff, Store, Tree}
4848+ alias DASL.CID
4949+5050+ # ---------------------------------------------------------------------------
5151+ # Construction
5252+ # ---------------------------------------------------------------------------
5353+5454+ @doc """
5555+ Returns a new empty tree backed by an `MST.Store.Memory`.
5656+5757+ Pass an explicit store to use a different backend:
5858+5959+ tree = MST.new(MST.Store.Memory.new())
6060+6161+ ## Examples
6262+6363+ iex> tree = MST.new()
6464+ iex> tree.root
6565+ nil
6666+6767+ """
6868+ @spec new() :: Tree.t()
6969+ def new, do: Tree.new(Store.Memory.new())
7070+7171+ @doc """
7272+ Returns a new empty tree backed by the given store.
7373+7474+ ## Examples
7575+7676+ iex> tree = MST.new(MST.Store.Memory.new())
7777+ iex> tree.root
7878+ nil
7979+8080+ """
8181+ @spec new(Store.t()) :: Tree.t()
8282+ def new(store), do: Tree.new(store)
8383+8484+ # ---------------------------------------------------------------------------
8585+ # Lookup / mutation
8686+ # ---------------------------------------------------------------------------
8787+8888+ @doc """
8989+ Looks up `key` in the tree.
9090+9191+ ## Examples
9292+9393+ iex> tree = MST.new()
9494+ iex> MST.get(tree, "col/k")
9595+ {:error, :not_found}
9696+9797+ """
9898+ @spec get(Tree.t(), binary()) :: {:ok, CID.t()} | {:error, :not_found} | {:error, atom()}
9999+ defdelegate get(tree, key), to: Tree
100100+101101+ @doc """
102102+ Inserts or updates `key` → `value`. Returns `{:ok, new_tree}`.
103103+104104+ ## Examples
105105+106106+ iex> tree = MST.new()
107107+ iex> val = DASL.CID.compute("data")
108108+ iex> {:ok, tree} = MST.put(tree, "col/k", val)
109109+ iex> MST.get(tree, "col/k")
110110+ {:ok, val}
111111+112112+ """
113113+ @spec put(Tree.t(), binary(), CID.t()) :: {:ok, Tree.t()} | {:error, atom()}
114114+ defdelegate put(tree, key, value), to: Tree
115115+116116+ @doc """
117117+ Removes `key` from the tree. Returns `{:ok, new_tree}` or
118118+ `{:error, :not_found}`.
119119+120120+ ## Examples
121121+122122+ iex> tree = MST.new()
123123+ iex> val = DASL.CID.compute("data")
124124+ iex> {:ok, tree} = MST.put(tree, "col/k", val)
125125+ iex> {:ok, tree} = MST.delete(tree, "col/k")
126126+ iex> MST.get(tree, "col/k")
127127+ {:error, :not_found}
128128+129129+ """
130130+ @spec delete(Tree.t(), binary()) :: {:ok, Tree.t()} | {:error, :not_found | atom()}
131131+ defdelegate delete(tree, key), to: Tree
132132+133133+ @doc """
134134+ Returns all key-value pairs in sorted order.
135135+ """
136136+ @spec to_list(Tree.t()) :: {:ok, [{binary(), CID.t()}]} | {:error, atom()}
137137+ defdelegate to_list(tree), to: Tree
138138+139139+ @doc """
140140+ Returns a lazy stream of `{key, value_cid}` pairs in sorted order.
141141+ """
142142+ @spec stream(Tree.t()) :: Enumerable.t()
143143+ defdelegate stream(tree), to: Tree
144144+145145+ @doc """
146146+ Returns the number of key-value pairs in the tree.
147147+ """
148148+ @spec length(Tree.t()) :: {:ok, non_neg_integer()} | {:error, atom()}
149149+ defdelegate length(tree), to: Tree
150150+151151+ # ---------------------------------------------------------------------------
152152+ # CAR I/O
153153+ # ---------------------------------------------------------------------------
154154+155155+ @doc """
156156+ Loads an MST from a CAR-encoded binary or an already-decoded `DASL.CAR` struct.
157157+158158+ When given a binary, it is decoded via `DASL.CAR.decode/2` first. When given
159159+ a `%DASL.CAR{}` struct the decoding step is skipped entirely, which avoids a
160160+ redundant encode/decode cycle when you already hold the struct in memory.
161161+162162+ Accepts the same options as `DASL.CAR.decode/2` (`verify: boolean`) when
163163+ called with a binary; options are ignored for the struct variant.
164164+165165+ ## Examples
166166+167167+ iex> tree = MST.new()
168168+ iex> val = DASL.CID.compute("x")
169169+ iex> {:ok, tree} = MST.put(tree, "col/a", val)
170170+ iex> {:ok, bin} = MST.to_car(tree)
171171+ iex> {:ok, tree2} = MST.from_car(bin)
172172+ iex> MST.get(tree2, "col/a")
173173+ {:ok, val}
174174+175175+ iex> tree = MST.new()
176176+ iex> val = DASL.CID.compute("x")
177177+ iex> {:ok, tree} = MST.put(tree, "col/a", val)
178178+ iex> {:ok, bin} = MST.to_car(tree)
179179+ iex> {:ok, car} = DASL.CAR.decode(bin)
180180+ iex> {:ok, tree2} = MST.from_car(car)
181181+ iex> MST.get(tree2, "col/a")
182182+ {:ok, val}
183183+4184 """
185185+ @spec from_car(binary() | DASL.CAR.t(), keyword()) :: {:ok, Tree.t()} | {:error, atom()}
186186+ def from_car(input, opts \\ [])
187187+ def from_car(%DASL.CAR{} = car, _opts), do: CAR.from_car(car)
188188+ def from_car(binary, opts) when is_binary(binary), do: CAR.from_binary(binary, opts)
51896190 @doc """
77- Hello world.
191191+ Serialises an `MST.Tree` to a CAR-encoded binary.
192192+ """
193193+ @spec to_car(Tree.t(), keyword()) :: {:ok, binary()} | {:error, atom()}
194194+ defdelegate to_car(tree, opts \\ []), to: CAR, as: :to_binary
195195+196196+ # ---------------------------------------------------------------------------
197197+ # Diff
198198+ # ---------------------------------------------------------------------------
199199+200200+ @doc """
201201+ Computes the diff from `tree_a` to `tree_b`.
202202+203203+ Returns an `MST.Diff` with `created_nodes`, `deleted_nodes`, and
204204+ `record_ops` sorted by key.
82059206 ## Examples
102071111- iex> MST.hello()
1212- :world
208208+ iex> tree_a = MST.new()
209209+ iex> val = DASL.CID.compute("v")
210210+ iex> {:ok, tree_b} = MST.put(tree_a, "col/a", val)
211211+ iex> {:ok, diff} = MST.diff(tree_a, tree_b)
212212+ iex> length(diff.record_ops)
213213+ 1
1321414215 """
1515- def hello do
1616- :world
1717- end
216216+ @spec diff(Tree.t(), Tree.t()) :: {:ok, Diff.t()} | {:error, atom()}
217217+ defdelegate diff(tree_a, tree_b), to: Diff, as: :compute
218218+219219+ # ---------------------------------------------------------------------------
220220+ # Utilities
221221+ # ---------------------------------------------------------------------------
222222+223223+ @doc """
224224+ Returns the MST depth for a key.
225225+226226+ SHA-256 hashes `key` and counts leading zero bits divided by 2 (floor).
227227+228228+ ## Examples
229229+230230+ iex> MST.key_height("2653ae71")
231231+ 0
232232+233233+ iex> MST.key_height("blue")
234234+ 1
235235+236236+ """
237237+ @spec key_height(binary()) :: non_neg_integer()
238238+ defdelegate key_height(key), to: MST.Height, as: :for_key
18239end
+290
lib/mst/car.ex
···11+defmodule MST.CAR do
22+ @moduledoc """
33+ Bridges `MST.Tree` with the DASL CAR file format.
44+55+ Provides functions to load an MST from a CAR binary or stream, and to export
66+ an MST back to CAR format. The CAR header's first root CID is treated as the
77+ MST root; any additional roots are ignored.
88+99+ MST node blocks (DAG-CBOR codec, `:drisl`) are decoded into `MST.Node`
1010+ structs and stored in an `MST.Store.Memory`. Non-MST blocks (e.g. record
1111+ data with the `:raw` codec) are ignored during import — the store only holds
1212+ MST structural nodes.
1313+1414+ ## Example
1515+1616+ {:ok, tree} = MST.CAR.from_binary(File.read!("repo.car"))
1717+ {:ok, pairs} = MST.Tree.to_list(tree)
1818+1919+ """
2020+2121+ alias DASL.{CAR, CID}
2222+ alias MST.{Node, Store, Tree}
2323+2424+ @type car_error() ::
2525+ {:error, :header, atom()}
2626+ | {:error, :block, atom()}
2727+ | {:error, atom()}
2828+2929+ # ---------------------------------------------------------------------------
3030+ # Import
3131+ # ---------------------------------------------------------------------------
3232+3333+ @doc """
3434+ Loads an MST from an already-decoded `DASL.CAR` struct.
3535+3636+ Populates an `MST.Store.Memory` from the struct's blocks map and returns an
3737+ `MST.Tree` rooted at the CAR's first root CID. Use this when you already hold
3838+ a `%DASL.CAR{}` in memory and want to avoid a redundant encode/decode cycle.
3939+4040+ ## Examples
4141+4242+ iex> store = MST.Store.Memory.new()
4343+ iex> tree = MST.Tree.new(store)
4444+ iex> val = DASL.CID.compute("data")
4545+ iex> {:ok, tree} = MST.Tree.put(tree, "col/key", val)
4646+ iex> {:ok, binary} = MST.CAR.to_binary(tree)
4747+ iex> {:ok, car} = DASL.CAR.decode(binary)
4848+ iex> {:ok, tree2} = MST.CAR.from_car(car)
4949+ iex> MST.Tree.get(tree2, "col/key")
5050+ {:ok, val}
5151+5252+ """
5353+ @spec from_car(CAR.t()) :: {:ok, Tree.t()} | car_error()
5454+ def from_car(%CAR{roots: roots, blocks: blocks}), do: build_tree(roots, blocks)
5555+5656+ @doc """
5757+ Loads an MST from a CAR-encoded binary.
5858+5959+ Decodes all blocks, populates an `MST.Store.Memory` with MST nodes (DAG-CBOR
6060+ codec), and returns an `MST.Tree` rooted at the CAR's first root CID.
6161+6262+ Accepts the same options as `DASL.CAR.decode/2` (`verify: boolean`).
6363+6464+ ## Examples
6565+6666+ iex> store = MST.Store.Memory.new()
6767+ iex> tree = MST.Tree.new(store)
6868+ iex> val = DASL.CID.compute("data")
6969+ iex> {:ok, tree} = MST.Tree.put(tree, "col/key", val)
7070+ iex> {:ok, binary} = MST.CAR.to_binary(tree)
7171+ iex> {:ok, tree2} = MST.CAR.from_binary(binary)
7272+ iex> MST.Tree.get(tree2, "col/key")
7373+ {:ok, val}
7474+7575+ """
7676+ @spec from_binary(binary(), keyword()) :: {:ok, Tree.t()} | car_error()
7777+ def from_binary(binary, opts \\ []) when is_binary(binary) do
7878+ try do
7979+ with {:ok, car} <- CAR.decode(binary, opts) do
8080+ build_tree(car.roots, car.blocks)
8181+ end
8282+ rescue
8383+ e in ArgumentError -> {:error, :header, {:invalid_binary, e.message}}
8484+ end
8585+ end
8686+8787+ @doc """
8888+ Loads an MST from a CAR stream (an `Enumerable` of binary chunks).
8989+9090+ Streams blocks through `DASL.CAR.stream_decode/2`, populating an
9191+ `MST.Store.Memory` incrementally. Useful for large files where you want to
9292+ avoid loading the full binary into memory at once. Converts stream raises
9393+ to error tuples.
9494+9595+ ## Options
9696+9797+ - `:verify` — verify CID digests of incoming blocks (default: `true`)
9898+9999+ ## Examples
100100+101101+ iex> store = MST.Store.Memory.new()
102102+ iex> tree = MST.Tree.new(store)
103103+ iex> val = DASL.CID.compute("data")
104104+ iex> {:ok, tree} = MST.Tree.put(tree, "col/key", val)
105105+ iex> {:ok, binary} = MST.CAR.to_binary(tree)
106106+ iex> chunk_stream = [binary]
107107+ iex> {:ok, tree2} = MST.CAR.from_stream(chunk_stream)
108108+ iex> MST.Tree.get(tree2, "col/key")
109109+ {:ok, val}
110110+111111+ """
112112+ @spec from_stream(Enumerable.t(), keyword()) :: {:ok, Tree.t()} | car_error()
113113+ def from_stream(stream, opts \\ []) do
114114+ try do
115115+ {roots, blocks} =
116116+ stream
117117+ |> CAR.stream_decode(opts)
118118+ |> Enum.reduce({nil, %{}}, fn
119119+ {:header, _version, roots}, {_roots, blocks} ->
120120+ {roots, blocks}
121121+122122+ {:block, cid, data}, {roots, blocks} ->
123123+ {roots, Map.put(blocks, cid, data)}
124124+ end)
125125+126126+ build_tree(roots || [], blocks)
127127+ rescue
128128+ e in RuntimeError -> {:error, {:stream_decode, e.message}}
129129+ end
130130+ end
131131+132132+ # ---------------------------------------------------------------------------
133133+ # Export
134134+ # ---------------------------------------------------------------------------
135135+136136+ @doc """
137137+ Serialises an `MST.Tree` to a CAR-encoded binary.
138138+139139+ Collects all reachable MST node blocks and wraps them in a CARv1 file with
140140+ the tree root as the sole header root.
141141+142142+ ## Examples
143143+144144+ iex> store = MST.Store.Memory.new()
145145+ iex> tree = MST.Tree.new(store)
146146+ iex> val = DASL.CID.compute("data")
147147+ iex> {:ok, tree} = MST.Tree.put(tree, "col/key", val)
148148+ iex> {:ok, binary} = MST.CAR.to_binary(tree)
149149+ iex> is_binary(binary)
150150+ true
151151+152152+ """
153153+ @spec to_binary(Tree.t()) :: {:ok, binary()} | car_error()
154154+ def to_binary(tree), do: to_binary(tree, [])
155155+156156+ @doc false
157157+ @spec to_binary(Tree.t(), keyword()) :: {:ok, binary()} | car_error()
158158+ def to_binary(%Tree{root: nil}, _opts) do
159159+ # Empty tree — emit a CAR with an empty node as root
160160+ empty_node = Node.empty()
161161+162162+ with {:ok, bytes} <- Node.encode(empty_node) do
163163+ cid = CID.compute(bytes, :drisl)
164164+165165+ car = %CAR{
166166+ version: 1,
167167+ roots: [cid],
168168+ blocks: %{cid => bytes}
169169+ }
170170+171171+ CAR.encode(car)
172172+ else
173173+ {:error, :encode, reason} -> {:error, reason}
174174+ end
175175+ end
176176+177177+ def to_binary(%Tree{root: root} = tree, opts) do
178178+ with {:ok, blocks} <- Tree.collect_blocks(tree) do
179179+ car = %CAR{
180180+ version: 1,
181181+ roots: [root],
182182+ blocks: blocks
183183+ }
184184+185185+ CAR.encode(car, opts)
186186+ end
187187+ end
188188+189189+ @doc """
190190+ Returns a stream of `DASL.CAR` stream items for the tree in pre-order
191191+ (root first, then depth-first left-to-right).
192192+193193+ Emits `{:header, 1, [root_cid]}` followed by `{:block, cid, bytes}` for
194194+ each reachable MST node.
195195+196196+ This stream can be piped into a custom CAR writer. It does **not** produce
197197+ a fully-encoded CAR binary — use `to_binary/2` for that.
198198+199199+ """
200200+ @spec to_stream(Tree.t()) :: Enumerable.t()
201201+ def to_stream(%Tree{root: nil}) do
202202+ empty_node = Node.empty()
203203+ {:ok, bytes} = Node.encode(empty_node)
204204+ cid = CID.compute(bytes, :drisl)
205205+206206+ [
207207+ {:header, 1, [cid]},
208208+ {:block, cid, bytes}
209209+ ]
210210+ end
211211+212212+ def to_stream(%Tree{root: root, store: store}) do
213213+ header = [{:header, 1, [root]}]
214214+ blocks = preorder_stream(store, root)
215215+ Stream.concat(header, blocks)
216216+ end
217217+218218+ # ---------------------------------------------------------------------------
219219+ # Private — tree construction from decoded blocks
220220+ # ---------------------------------------------------------------------------
221221+222222+ @spec build_tree([CID.t()], %{CID.t() => binary()}) :: {:ok, Tree.t()} | car_error()
223223+ defp build_tree([], _blocks), do: {:ok, Tree.new(Store.Memory.new())}
224224+225225+ defp build_tree([root | _], blocks) do
226226+ # Decode all DAG-CBOR blocks into MST nodes; ignore raw-codec blocks.
227227+ result =
228228+ Enum.reduce_while(blocks, {:ok, Store.Memory.new()}, fn {cid, data}, {:ok, store} ->
229229+ case decode_block(cid, data) do
230230+ {:ok, node} ->
231231+ {:cont, {:ok, Store.put(store, cid, node)}}
232232+233233+ :skip ->
234234+ {:cont, {:ok, store}}
235235+236236+ {:error, _} = err ->
237237+ {:halt, err}
238238+ end
239239+ end)
240240+241241+ case result do
242242+ {:ok, store} -> {:ok, Tree.from_root(root, store)}
243243+ err -> err
244244+ end
245245+ end
246246+247247+ @spec decode_block(CID.t(), binary()) :: {:ok, Node.t()} | :skip | {:error, atom()}
248248+ defp decode_block(%CID{codec: :raw}, _data), do: :skip
249249+250250+ defp decode_block(%CID{codec: :drisl}, data) do
251251+ case Node.decode(data) do
252252+ {:ok, node} -> {:ok, node}
253253+ {:error, :decode, reason} -> {:error, reason}
254254+ end
255255+ end
256256+257257+ # ---------------------------------------------------------------------------
258258+ # Private — pre-order DFS stream
259259+ # ---------------------------------------------------------------------------
260260+261261+ @spec preorder_stream(Store.t(), CID.t()) :: Enumerable.t()
262262+ defp preorder_stream(store, root) do
263263+ Stream.resource(
264264+ fn -> [root] end,
265265+ fn
266266+ [] ->
267267+ {:halt, []}
268268+269269+ [cid | rest] ->
270270+ case Store.get(store, cid) do
271271+ {:error, :not_found} ->
272272+ raise "MST.CAR.to_stream/1: node not found: #{CID.encode(cid)}"
273273+274274+ {:ok, node} ->
275275+ {:ok, bytes} = Node.encode(node)
276276+ children = subtree_cids(node)
277277+ {[{:block, cid, bytes}], children ++ rest}
278278+ end
279279+ end,
280280+ fn _ -> :ok end
281281+ )
282282+ end
283283+284284+ @spec subtree_cids(Node.t()) :: [CID.t()]
285285+ defp subtree_cids(node) do
286286+ left = if node.left, do: [node.left], else: []
287287+ rights = Enum.flat_map(node.entries, fn e -> if e.right, do: [e.right], else: [] end)
288288+ left ++ rights
289289+ end
290290+end
+214
lib/mst/diff.ex
···11+defmodule MST.Diff do
22+ @moduledoc """
33+ Computes the diff between two `MST.Tree` instances.
44+55+ A diff captures:
66+77+ - Which MST nodes were **created** (present in `b` but not `a`)
88+ - Which MST nodes were **deleted** (present in `a` but not `b`)
99+ - The per-key **record operations** (creates, updates, and deletes)
1010+1111+ ## Algorithm
1212+1313+ Node sets (`created_nodes` / `deleted_nodes`) are computed by collecting all
1414+ reachable node CIDs from each tree root with a DFS, then taking set
  differences. Each tree is traversed independently; within a single traversal
  a CID that has already been visited is not descended into again.
1717+1818+ Record ops are computed by fully materialising both trees as sorted key/value
1919+ lists and performing a linear sorted-merge comparison. This is straightforward
2020+ and correct at the cost of O(n) memory; it is the right tradeoff given that
2121+ the diff is typically used to inspect the full changeset anyway.
2222+2323+ ## Example
2424+2525+ {:ok, diff} = MST.Diff.compute(tree_a, tree_b)
2626+ # diff.record_ops is a sorted list of MST.Diff.Op structs
2727+2828+ """
2929+3030+ use TypedStruct
3131+3232+ alias DASL.CID
3333+ alias MST.{Node, Store, Tree}
3434+3535+ @type diff_error() :: {:error, atom()}
3636+3737+ typedstruct enforce: true do
3838+ field :created_nodes, MapSet.t(CID.t()), default: MapSet.new()
3939+ field :deleted_nodes, MapSet.t(CID.t()), default: MapSet.new()
4040+ field :record_ops, [MST.Diff.Op.t()], default: []
4141+ end
4242+4343+ # ---------------------------------------------------------------------------
4444+ # Public API
4545+ # ---------------------------------------------------------------------------
4646+4747+ @doc """
4848+ Computes the diff from `tree_a` to `tree_b`.
4949+5050+ Both trees must use stores that have their nodes populated (e.g. loaded from
5151+ CAR files). Returns `{:ok, diff}` or an error if a node is missing.
5252+5353+ ## Examples
5454+5555+ iex> store = MST.Store.Memory.new()
5656+ iex> ta = MST.Tree.new(store)
5757+ iex> val = DASL.CID.compute("v")
5858+ iex> {:ok, tb} = MST.Tree.put(ta, "col/a", val)
5959+ iex> {:ok, diff} = MST.Diff.compute(ta, tb)
6060+ iex> length(diff.record_ops)
6161+ 1
6262+ iex> hd(diff.record_ops).key
6363+ "col/a"
6464+6565+ """
6666+ @spec compute(Tree.t(), Tree.t()) :: {:ok, t()} | diff_error()
6767+ def compute(%Tree{root: root_a, store: store_a}, %Tree{root: root_b, store: store_b}) do
6868+ with {:ok, nodes_a} <- reachable_nodes(store_a, root_a),
6969+ {:ok, nodes_b} <- reachable_nodes(store_b, root_b),
7070+ {:ok, leaves_a} <- collect_leaves(store_a, root_a),
7171+ {:ok, leaves_b} <- collect_leaves(store_b, root_b) do
7272+ ops = merge_ops(leaves_a, leaves_b, [])
7373+7474+ {:ok,
7575+ %__MODULE__{
7676+ created_nodes: MapSet.difference(nodes_b, nodes_a),
7777+ deleted_nodes: MapSet.difference(nodes_a, nodes_b),
7878+ record_ops: ops
7979+ }}
8080+ end
8181+ end
8282+8383+ # ---------------------------------------------------------------------------
8484+ # Private — reachable node collection
8585+ # ---------------------------------------------------------------------------
8686+8787+ @spec reachable_nodes(Store.t(), CID.t() | nil) :: {:ok, MapSet.t(CID.t())} | diff_error()
8888+ defp reachable_nodes(_store, nil), do: {:ok, MapSet.new()}
8989+ defp reachable_nodes(store, root), do: collect_nodes(store, root, MapSet.new())
9090+9191+ @spec collect_nodes(Store.t(), CID.t(), MapSet.t(CID.t())) ::
9292+ {:ok, MapSet.t(CID.t())} | diff_error()
9393+ defp collect_nodes(store, cid, visited) do
9494+ if MapSet.member?(visited, cid) do
9595+ {:ok, visited}
9696+ else
9797+ with {:ok, node} <- fetch(store, cid) do
9898+ visited = MapSet.put(visited, cid)
9999+100100+ Enum.reduce_while(subtree_cids(node), {:ok, visited}, fn sub, {:ok, v} ->
101101+ case collect_nodes(store, sub, v) do
102102+ {:ok, v} -> {:cont, {:ok, v}}
103103+ err -> {:halt, err}
104104+ end
105105+ end)
106106+ end
107107+ end
108108+ end
109109+110110+ @spec subtree_cids(Node.t()) :: [CID.t()]
111111+ defp subtree_cids(node) do
112112+ left = if node.left, do: [node.left], else: []
113113+ rights = Enum.flat_map(node.entries, fn e -> if e.right, do: [e.right], else: [] end)
114114+ left ++ rights
115115+ end
116116+117117+ # ---------------------------------------------------------------------------
118118+ # Private — leaf collection (in sorted order)
119119+ # ---------------------------------------------------------------------------
120120+121121+ @spec collect_leaves(Store.t(), CID.t() | nil) ::
122122+ {:ok, [{binary(), CID.t()}]} | diff_error()
123123+ defp collect_leaves(_store, nil), do: {:ok, []}
124124+125125+ defp collect_leaves(store, root) do
126126+ with {:ok, pairs} <- do_walk(store, root, []) do
127127+ {:ok, Enum.reverse(pairs)}
128128+ end
129129+ end
130130+131131+ # Accumulates pairs in reverse order (prepend for efficiency, reverse at end).
132132+ @spec do_walk(Store.t(), CID.t(), [{binary(), CID.t()}]) ::
133133+ {:ok, [{binary(), CID.t()}]} | diff_error()
134134+ defp do_walk(store, cid, acc) do
135135+ with {:ok, node} <- fetch(store, cid) do
136136+ full_keys = Node.keys(node)
137137+ do_walk_left(store, node, full_keys, acc)
138138+ end
139139+ end
140140+141141+ @spec do_walk_left(Store.t(), Node.t(), [binary()], [{binary(), CID.t()}]) ::
142142+ {:ok, [{binary(), CID.t()}]} | diff_error()
143143+ defp do_walk_left(store, node, full_keys, acc) do
144144+ with {:ok, acc} <- maybe_do_walk(store, node.left, acc) do
145145+ do_walk_entries(store, node.entries, full_keys, acc)
146146+ end
147147+ end
148148+149149+ defp maybe_do_walk(_store, nil, acc), do: {:ok, acc}
150150+ defp maybe_do_walk(store, cid, acc), do: do_walk(store, cid, acc)
151151+152152+ defp do_walk_entries(_store, [], [], acc), do: {:ok, acc}
153153+154154+ defp do_walk_entries(store, [entry | rest_e], [key | rest_k], acc) do
155155+ acc = [{key, entry.value} | acc]
156156+157157+ with {:ok, acc} <- maybe_do_walk(store, entry.right, acc) do
158158+ do_walk_entries(store, rest_e, rest_k, acc)
159159+ end
160160+ end
161161+162162+ # ---------------------------------------------------------------------------
163163+ # Private — sorted-merge diff
164164+ # ---------------------------------------------------------------------------
165165+166166+ @spec merge_ops(
167167+ [{binary(), CID.t()}],
168168+ [{binary(), CID.t()}],
169169+ [MST.Diff.Op.t()]
170170+ ) :: [MST.Diff.Op.t()]
171171+ defp merge_ops([], [], ops), do: Enum.reverse(ops)
172172+173173+ defp merge_ops([], [{kb, vb} | rest_b], ops) do
174174+ op = %MST.Diff.Op{key: kb, old_value: nil, new_value: vb}
175175+ merge_ops([], rest_b, [op | ops])
176176+ end
177177+178178+ defp merge_ops([{ka, va} | rest_a], [], ops) do
179179+ op = %MST.Diff.Op{key: ka, old_value: va, new_value: nil}
180180+ merge_ops(rest_a, [], [op | ops])
181181+ end
182182+183183+ defp merge_ops([{ka, va} | rest_a], [{kb, vb} | rest_b], ops) do
184184+ cond do
185185+ ka == kb ->
186186+ new_ops =
187187+ if va == vb,
188188+ do: ops,
189189+ else: [%MST.Diff.Op{key: ka, old_value: va, new_value: vb} | ops]
190190+191191+ merge_ops(rest_a, rest_b, new_ops)
192192+193193+ ka < kb ->
194194+ op = %MST.Diff.Op{key: ka, old_value: va, new_value: nil}
195195+ merge_ops(rest_a, [{kb, vb} | rest_b], [op | ops])
196196+197197+ true ->
198198+ op = %MST.Diff.Op{key: kb, old_value: nil, new_value: vb}
199199+ merge_ops([{ka, va} | rest_a], rest_b, [op | ops])
200200+ end
201201+ end
202202+203203+ # ---------------------------------------------------------------------------
204204+ # Private — store access
205205+ # ---------------------------------------------------------------------------
206206+207207+ @spec fetch(Store.t(), CID.t()) :: {:ok, Node.t()} | diff_error()
208208+ defp fetch(store, cid) do
209209+ case Store.get(store, cid) do
210210+ {:ok, node} -> {:ok, node}
211211+ {:error, :not_found} -> {:error, :missing_node}
212212+ end
213213+ end
214214+end
+19
lib/mst/diff/op.ex
defmodule MST.Diff.Op do
  @moduledoc """
  A single key-level operation produced by `MST.Diff.compute/2`.

  The kind of operation is encoded by which side is `nil`:

  - `old_value: nil, new_value: cid` — create
  - `old_value: cid, new_value: cid` — update
  - `old_value: cid, new_value: nil` — delete
  """

  use TypedStruct

  alias DASL.CID

  # `enforce: true` with no defaults: every Op must be built with `key`,
  # `old_value`, and `new_value` given explicitly (use `nil` for an absent side).
  typedstruct enforce: true do
    field :key, binary()
    field :old_value, CID.t() | nil
    field :new_value, CID.t() | nil
  end
end
+71
lib/mst/height.ex
defmodule MST.Height do
  @moduledoc """
  Key-depth computation for the AT Protocol Merkle Search Tree.

  A key's depth (also called "layer" or "height") is obtained by SHA-256
  hashing the key, counting the leading zero bits of the digest, and halving
  that count (rounding down). Two zero bits per level gives a theoretical
  fanout of 4: each additional level is four times rarer than the one below.

  Spec: https://atproto.com/specs/repository#mst-structure
  """

  @doc """
  Computes the MST depth for a given key.

  Hashes `key` with SHA-256, counts the leading zero bits of the digest, and
  returns that count divided by 2 (floor). The result is a non-negative
  integer; depth 0 is the most common (probability ~75% per key) and each
  higher depth is four times rarer.

  ## Examples

      iex> MST.Height.for_key("2653ae71")
      0

      iex> MST.Height.for_key("blue")
      1

      iex> MST.Height.for_key("app.bsky.feed.post/454397e440ec")
      4

      iex> MST.Height.for_key("app.bsky.feed.post/9adeb165882c")
      8

  """
  @spec for_key(binary()) :: non_neg_integer()
  def for_key(key) when is_binary(key) do
    digest = :crypto.hash(:sha256, key)
    div(count_leading_zero_bits(digest, 0), 2)
  end

  # ---------------------------------------------------------------------------
  # Private helpers
  # ---------------------------------------------------------------------------

  # Consumes the digest one bit at a time while the leading bit is 0; stops at
  # the first 1 bit or when the bits run out.
  @spec count_leading_zero_bits(bitstring(), non_neg_integer()) :: non_neg_integer()
  defp count_leading_zero_bits(<<0::1, rest::bitstring>>, count),
    do: count_leading_zero_bits(rest, count + 1)

  defp count_leading_zero_bits(_first_bit_set_or_empty, count), do: count
end
+282
lib/mst/node.ex
···11+defmodule MST.Node do
22+ @moduledoc """
33+ Wire-format representation of a single MST node, plus encode/decode.
44+55+ An MST node holds an optional left subtree CID (`left`) and an ordered list
66+ of `MST.Node.Entry` values, each carrying a key suffix, a value CID, and an
77+ optional right subtree CID. This maps exactly to the AT Protocol node schema:
88+99+ { l: CID | null, e: [ { p, k, v, t } ] }
1010+1111+ Keys inside a node are prefix-compressed: each entry's `key_suffix` is the
1212+ portion of the full key that follows the bytes it shares with the previous
1313+ entry's full key. The first entry always has `prefix_len: 0` and carries its
1414+ full key in `key_suffix`. Prefix compression is mandatory — the serialised
1515+ form must be deterministic across implementations.
1616+1717+ Spec: https://atproto.com/specs/repository#mst-structure
1818+ """
1919+2020+ use TypedStruct
2121+2222+ alias DASL.{CID, DRISL}
2323+ alias MST.Node.Entry
2424+2525+ @type encode_error() :: {:error, :encode, atom()}
2626+ @type decode_error() :: {:error, :decode, atom()}
2727+2828+ typedstruct enforce: true do
2929+ field :left, CID.t() | nil
3030+ field :entries, [Entry.t()], default: []
3131+ end
3232+3333+ # ---------------------------------------------------------------------------
3434+ # Construction helpers
3535+ # ---------------------------------------------------------------------------
3636+3737+ @doc """
3838+ Returns an empty MST node — the only valid representation of an empty tree.
3939+4040+ ## Examples
4141+4242+ iex> MST.Node.empty()
4343+ %MST.Node{left: nil, entries: []}
4444+4545+ """
4646+ @spec empty() :: t()
4747+ def empty, do: %__MODULE__{left: nil, entries: []}
4848+4949+ # ---------------------------------------------------------------------------
5050+ # Key expansion
5151+ # ---------------------------------------------------------------------------
5252+5353+ @doc """
5454+ Reconstructs the full keys for all entries in the node.
5555+5656+ Each entry stores only the suffix of its key relative to the previous entry.
5757+ This function walks the entry list and accumulates the full key for each.
5858+5959+ ## Examples
6060+6161+ iex> cid = DASL.CID.compute("a")
6262+ iex> entries = [
6363+ ...> %MST.Node.Entry{prefix_len: 0, key_suffix: "foo/bar", value: cid, right: nil},
6464+ ...> %MST.Node.Entry{prefix_len: 4, key_suffix: "baz", value: cid, right: nil},
6565+ ...> ]
6666+ iex> MST.Node.keys(%MST.Node{left: nil, entries: entries})
6767+ ["foo/bar", "foo/baz"]
6868+6969+ """
7070+ @spec keys(t()) :: [binary()]
7171+ def keys(%__MODULE__{entries: entries}), do: expand_keys(entries, "", [])
7272+7373+ # ---------------------------------------------------------------------------
7474+ # CID computation
7575+ # ---------------------------------------------------------------------------
@doc """
Returns the `:drisl`-codec CID of a node.

The node is first serialised with `encode/1`; the CID is computed over the
resulting bytes. Any encoding failure is passed through untouched.

## Examples

    iex> {:ok, cid} = MST.Node.cid(MST.Node.empty())
    iex> cid.codec
    :drisl

"""
@spec cid(t()) :: {:ok, CID.t()} | encode_error()
def cid(node) do
  case encode(node) do
    {:ok, bytes} -> {:ok, CID.compute(bytes, :drisl)}
    failure -> failure
  end
end
9696+9797+ # ---------------------------------------------------------------------------
9898+ # Encoding
9999+ # ---------------------------------------------------------------------------
@doc """
Serialises an `MST.Node` into DRISL CBOR bytes.

Absent subtree links (`nil`) are written as an explicit CBOR `null` rather
than being left out. This matters for cross-implementation compatibility:
omitting the key yields different bytes, and therefore a different CID.

Two-element `{:error, atom}` failures are re-tagged as
`{:error, :encode, atom}`.

## Examples

    iex> {:ok, bytes} = MST.Node.encode(MST.Node.empty())
    iex> is_binary(bytes)
    true

"""
@spec encode(t()) :: {:ok, binary()} | encode_error()
def encode(%__MODULE__{left: left_link, entries: node_entries}) do
  with {:ok, encoded_entries} <- encode_entries(node_entries),
       {:ok, _bytes} = success <- DRISL.encode(%{"e" => encoded_entries, "l" => left_link}) do
    success
  else
    {:error, :encode, _reason} = failure -> failure
    {:error, reason} when is_atom(reason) -> {:error, :encode, reason}
  end
end
125125+126126+ # ---------------------------------------------------------------------------
127127+ # Decoding
128128+ # ---------------------------------------------------------------------------
@doc """
Parses DRISL CBOR bytes into an `MST.Node`.

The input must contain exactly one CBOR item; leftover bytes are rejected
with `{:error, :decode, :trailing_bytes}`. Two-element `{:error, atom}`
failures from the decoder are re-tagged as `{:error, :decode, atom}`.

## Examples

    iex> {:ok, bytes} = MST.Node.encode(MST.Node.empty())
    iex> {:ok, node} = MST.Node.decode(bytes)
    iex> node.entries
    []
    iex> node.left
    nil

"""
@spec decode(binary()) :: {:ok, t()} | decode_error()
def decode(bytes) when is_binary(bytes) do
  with {:ok, term, <<>>} <- DRISL.decode(bytes) do
    # decode_term/1 already answers with {:ok, node} or {:error, :decode, _}.
    decode_term(term)
  else
    {:ok, _term, _leftover} -> {:error, :decode, :trailing_bytes}
    {:error, :decode, _reason} = failure -> failure
    {:error, reason} when is_atom(reason) -> {:error, :decode, reason}
  end
end
154154+155155+ # ---------------------------------------------------------------------------
156156+ # Compression helpers (used by MST.Tree)
157157+ # ---------------------------------------------------------------------------
@doc """
Turns `{full_key, value_cid, right_cid | nil}` triples into prefix-compressed
`MST.Node.Entry` structs.

Each entry records how many leading bytes it shares with the full key of the
entry before it, plus the remaining suffix. The first entry is compared
against the empty key and therefore always has `prefix_len: 0`.

## Examples

    iex> cid = DASL.CID.compute("x")
    iex> entries = MST.Node.compress_entries([{"abc/def", cid, nil}, {"abc/ghi", cid, nil}])
    iex> hd(tl(entries)).prefix_len
    4

"""
@spec compress_entries([{binary(), CID.t(), CID.t() | nil}]) :: [Entry.t()]
def compress_entries(triples) do
  do_compress(triples, "", [])
end
176176+177177+ # ---------------------------------------------------------------------------
178178+ # Private helpers
179179+ # ---------------------------------------------------------------------------
# Rebuilds full keys from prefix-compressed entries. `previous` is the full key
# of the entry handled last; `collected` holds the results in reverse order.
@spec expand_keys([Entry.t()], binary(), [binary()]) :: [binary()]
defp expand_keys([entry | remaining], previous, collected) do
  shared = binary_part(previous, 0, entry.prefix_len)
  full_key = shared <> entry.key_suffix
  expand_keys(remaining, full_key, [full_key | collected])
end

defp expand_keys([], _previous, collected), do: Enum.reverse(collected)
# Builds compressed entries one triple at a time. `previous` is the full key of
# the triple handled last; `built` accumulates entries in reverse order.
@spec do_compress([{binary(), CID.t(), CID.t() | nil}], binary(), [Entry.t()]) :: [Entry.t()]
defp do_compress([{key, value, right} | remaining], previous, built) do
  shared = common_prefix_length(previous, key)
  # `shared` never exceeds byte_size(key), so this match always succeeds.
  <<_prefix::binary-size(shared), suffix::binary>> = key

  entry = %Entry{prefix_len: shared, key_suffix: suffix, value: value, right: right}
  do_compress(remaining, key, [entry | built])
end

defp do_compress([], _previous, built), do: Enum.reverse(built)
# Number of leading bytes two binaries have in common.
@spec common_prefix_length(binary(), binary()) :: non_neg_integer()
defp common_prefix_length(a, b) do
  cpl(a, b, 0)
end

# Consumes one byte from each side while they are equal, counting as it goes.
defp cpl(<<byte, rest_a::binary>>, <<byte, rest_b::binary>>, count) do
  cpl(rest_a, rest_b, count + 1)
end

defp cpl(_a, _b, count), do: count
# Encodes every entry into its CBOR-ready map, preserving order. The match on
# `{:ok, map}` is assertive: encode_entry/1 is expected to succeed.
@spec encode_entries([Entry.t()]) :: {:ok, [map()]} | encode_error()
defp encode_entries(entries) do
  encoded =
    Enum.map(entries, fn entry ->
      {:ok, map} = encode_entry(entry)
      map
    end)

  {:ok, encoded}
end
# Maps an entry onto the wire keys "k", "p", "t" and "v". The key suffix is
# wrapped in a bytes tag; a nil right link is kept as nil.
@spec encode_entry(Entry.t()) :: {:ok, map()} | encode_error()
defp encode_entry(%Entry{} = entry) do
  encoded = %{
    "k" => %CBOR.Tag{tag: :bytes, value: entry.key_suffix},
    "p" => entry.prefix_len,
    "t" => entry.right,
    "v" => entry.value
  }

  {:ok, encoded}
end
# Validates the top-level decoded map: "e" must be a list of entries and "l" a
# CID link or nil. Anything else is reported as :invalid_structure.
@spec decode_term(any()) :: {:ok, t()} | decode_error()
defp decode_term(%{"e" => raw_entries, "l" => raw_left}) when is_list(raw_entries) do
  with {:ok, left_link} <- decode_cid_or_null(raw_left),
       {:ok, decoded} <- decode_entries(raw_entries) do
    {:ok, %__MODULE__{entries: decoded, left: left_link}}
  end
end

defp decode_term(_other), do: {:error, :decode, :invalid_structure}
# Decodes the raw entry list, stopping at the first invalid entry.
#
# Besides the per-entry shape check, every entry's `prefix_len` is validated
# against the full key of the entry before it (the first entry is checked
# against the empty key). Without this, a node whose `prefix_len` exceeds the
# previous key's length would decode fine and only blow up later with an
# ArgumentError inside `binary_part/3` when the keys are expanded. Such nodes
# are rejected here with `{:error, :decode, :invalid_prefix_len}`.
@spec decode_entries(list()) :: {:ok, [Entry.t()]} | decode_error()
defp decode_entries(entries_raw) do
  # Accumulator: decoded entries (reversed) plus the previous full key.
  result =
    Enum.reduce_while(entries_raw, {:ok, [], ""}, fn raw, {:ok, acc, prev_key} ->
      with {:ok, entry} <- decode_entry(raw),
           {:ok, full_key} <- checked_full_key(prev_key, entry) do
        {:cont, {:ok, [entry | acc], full_key}}
      else
        {:error, :decode, _reason} = err -> {:halt, err}
      end
    end)

  case result do
    {:ok, reversed, _last_key} -> {:ok, Enum.reverse(reversed)}
    {:error, :decode, _reason} = err -> err
  end
end

# Rebuilds an entry's full key from the previous full key, refusing prefix
# lengths that reach past the end of that key.
@spec checked_full_key(binary(), Entry.t()) :: {:ok, binary()} | decode_error()
defp checked_full_key(prev_key, %Entry{prefix_len: plen, key_suffix: suffix})
     when plen <= byte_size(prev_key) do
  {:ok, binary_part(prev_key, 0, plen) <> suffix}
end

defp checked_full_key(_prev_key, _entry), do: {:error, :decode, :invalid_prefix_len}
# Validates one raw entry map: "k" must be a bytes tag holding a binary, "p" a
# non-negative integer, "v" a CID, and "t" a CID or nil. Any other shape is
# reported as :invalid_entry.
@spec decode_entry(any()) :: {:ok, Entry.t()} | decode_error()
defp decode_entry(%{
       "k" => %CBOR.Tag{tag: :bytes, value: suffix},
       "p" => shared_len,
       "t" => raw_right,
       "v" => %CID{} = value
     })
     when is_binary(suffix) and is_integer(shared_len) and shared_len >= 0 do
  case decode_cid_or_null(raw_right) do
    {:ok, right} ->
      {:ok, %Entry{prefix_len: shared_len, key_suffix: suffix, value: value, right: right}}

    failure ->
      failure
  end
end

defp decode_entry(_other), do: {:error, :decode, :invalid_entry}
# Accepts an optional link: nil or a CID struct. Everything else is an
# :invalid_cid_link decode error.
@spec decode_cid_or_null(any()) :: {:ok, CID.t() | nil} | decode_error()
defp decode_cid_or_null(link) do
  case link do
    nil -> {:ok, nil}
    %CID{} -> {:ok, link}
    _other -> {:error, :decode, :invalid_cid_link}
  end
end
282282+end
+20
lib/mst/node/entry.ex
defmodule MST.Node.Entry do
  @moduledoc """
  A single entry within an `MST.Node`.

  Stores a compressed key (`prefix_len` bytes shared with the previous entry's
  full key, plus `key_suffix`), the CID of the value record (`value`), and an
  optional CID pointing to a right subtree (`right`).

  The full key of an entry is the first `prefix_len` bytes of the previous
  entry's full key followed by `key_suffix`.
  """

  use TypedStruct

  alias DASL.CID

  # Every field is enforced, so `right` must be given explicitly even when nil.
  typedstruct enforce: true do
    # Number of leading bytes shared with the previous entry's full key.
    field :prefix_len, non_neg_integer()
    # Remaining key bytes after the shared prefix.
    field :key_suffix, binary()
    # CID of the value record stored under this key.
    field :value, CID.t()
    # CID of the right subtree, or nil when there is none.
    field :right, CID.t() | nil
  end
end
+94
lib/mst/store.ex
defmodule MST.Store do
  @moduledoc """
  Behaviour for MST node block stores.

  A store maps a `DASL.CID` to a decoded `MST.Node`. Implementations provide
  the four callbacks below and keep their state in an opaque term of their
  own choosing.

  The built-in implementation is `MST.Store.Memory`, a map-backed store
  suitable for in-memory use and tests.

  ## Usage

  A store value is a `{module, state}` pair, and `MST.Tree` holds one. Going
  through the tree API is usually enough; talking to the store directly is
  only needed when building trees from external data (e.g., importing a CAR
  file).

      store = MST.Store.Memory.new()
      tree = MST.Tree.new(store)

  """

  alias DASL.CID
  alias MST.Node

  @type t() :: {module(), any()}

  @doc """
  Retrieves a node by CID. Returns `{:ok, node}` or `{:error, :not_found}`.
  """
  @callback get(state :: any(), CID.t()) :: {:ok, Node.t()} | {:error, :not_found}

  @doc """
  Stores a node under its CID. Returns updated state.
  """
  @callback put(state :: any(), CID.t(), Node.t()) :: any()

  @doc """
  Returns `true` if the store contains a node for the given CID.
  """
  @callback has?(state :: any(), CID.t()) :: boolean()

  @doc """
  Returns all CIDs present in the store.
  """
  @callback cids(state :: any()) :: [CID.t()]

  # ---------------------------------------------------------------------------
  # Dispatch helpers — unpack the {module, state} pair and call the callback.
  # ---------------------------------------------------------------------------

  @doc """
  Looks up the node stored under `cid`.

  ## Examples

      iex> store = MST.Store.Memory.new()
      iex> cid = DASL.CID.compute("test", :drisl)
      iex> MST.Store.get(store, cid)
      {:error, :not_found}

  """
  @spec get(t(), CID.t()) :: {:ok, Node.t()} | {:error, :not_found}
  def get({impl, impl_state}, cid) do
    impl.get(impl_state, cid)
  end

  @doc """
  Writes `node` under `cid` and returns the updated `{module, state}` pair.

  ## Examples

      iex> store = MST.Store.Memory.new()
      iex> node = MST.Node.empty()
      iex> {:ok, cid} = MST.Node.cid(node)
      iex> store2 = MST.Store.put(store, cid, node)
      iex> {:ok, _} = MST.Store.get(store2, cid)
      iex> :ok
      :ok

  """
  @spec put(t(), CID.t(), Node.t()) :: t()
  def put({impl, impl_state}, cid, node) do
    updated_state = impl.put(impl_state, cid, node)
    {impl, updated_state}
  end

  @doc """
  Tells whether the store holds a node for `cid`.
  """
  @spec has?(t(), CID.t()) :: boolean()
  def has?({impl, impl_state}, cid) do
    impl.has?(impl_state, cid)
  end

  @doc """
  Lists every CID the store currently holds.
  """
  @spec cids(t()) :: [CID.t()]
  def cids({impl, impl_state}) do
    impl.cids(impl_state)
  end
end
+61
lib/mst/store/memory.ex
defmodule MST.Store.Memory do
  @moduledoc """
  Map-backed in-memory MST node store.

  Intended for tests and for any situation where the whole tree fits in
  memory. The state is a plain `%{}` map keyed by `DASL.CID` with `MST.Node`
  values.

  ## Usage

      store = MST.Store.Memory.new()
      tree = MST.Tree.new(store)

  """

  @behaviour MST.Store

  alias DASL.CID
  alias MST.Node

  @type state() :: %{CID.t() => Node.t()}

  @doc """
  Creates an empty memory store, returned as a `{MST.Store.Memory, %{}}` pair.

  ## Examples

      iex> {mod, state} = MST.Store.Memory.new()
      iex> mod
      MST.Store.Memory
      iex> state
      %{}

  """
  @spec new() :: MST.Store.t()
  def new do
    {__MODULE__, %{}}
  end

  # ---------------------------------------------------------------------------
  # MST.Store callbacks
  # ---------------------------------------------------------------------------

  @impl MST.Store
  @spec get(state(), CID.t()) :: {:ok, Node.t()} | {:error, :not_found}
  def get(state, cid) do
    # Map.fetch/2 already answers {:ok, node} on a hit; only the miss is remapped.
    with :error <- Map.fetch(state, cid) do
      {:error, :not_found}
    end
  end

  @impl MST.Store
  @spec put(state(), CID.t(), Node.t()) :: state()
  def put(state, cid, node) do
    Map.put(state, cid, node)
  end

  @impl MST.Store
  @spec has?(state(), CID.t()) :: boolean()
  def has?(state, cid) do
    Map.has_key?(state, cid)
  end

  @impl MST.Store
  @spec cids(state()) :: [CID.t()]
  def cids(state) do
    Map.keys(state)
  end
end
+809
lib/mst/tree.ex
···11+defmodule MST.Tree do
22+ @moduledoc """
33+ An in-memory Merkle Search Tree.
44+55+ `MST.Tree` is the primary interface for building and querying MSTs. It pairs
66+ a root CID (or `nil` for an empty tree) with a block store that maps CIDs to
77+ decoded `MST.Node` structs.
88+99+ All mutation operations (`put/3`, `delete/3`) return a new `MST.Tree` —
1010+ the data structure is persistent/immutable. The underlying store accumulates
1111+ all written nodes across mutations; stale nodes are not removed (no GC).
1212+1313+ ## Example
1414+1515+ store = MST.Store.Memory.new()
1616+ tree = MST.Tree.new(store)
1717+1818+ val = DASL.CID.compute("my record data")
1919+ {:ok, tree} = MST.Tree.put(tree, "collection/record-key", val)
2020+ {:ok, ^val} = MST.Tree.get(tree, "collection/record-key")
2121+2222+ {:ok, tree} = MST.Tree.delete(tree, "collection/record-key")
2323+ {:error, :not_found} = MST.Tree.get(tree, "collection/record-key")
2424+2525+ """
2626+2727+ use TypedStruct
2828+ import Kernel, except: [length: 1]
2929+3030+ alias DASL.CID
3131+ alias MST.{Height, Node, Node.Entry, Store}
# A store is the `{module, state}` pair defined by `MST.Store`.
@type store() :: Store.t()
# Generic failure shape used by the tree operations in this module.
@type tree_error() :: {:error, atom()}

# Both keys are enforced at construction time.
typedstruct enforce: true do
  # CID of the root node, or nil for an empty tree.
  field :root, CID.t() | nil
  # Block store the tree's nodes are read from and written to.
  field :store, store()
end
4040+4141+ # ---------------------------------------------------------------------------
4242+ # Construction
4343+ # ---------------------------------------------------------------------------
@doc """
Creates an empty tree (no root) on top of the given store.

## Examples

    iex> tree = MST.Tree.new(MST.Store.Memory.new())
    iex> tree.root
    nil

"""
@spec new(store()) :: t()
def new(store) do
  %__MODULE__{store: store, root: nil}
end
@doc """
Wraps an existing root CID and store into a tree.

Nothing is read from the store here; the root is taken as given. Handy after
a store has been populated externally (e.g. after loading from a CAR file).

## Examples

    iex> store = MST.Store.Memory.new()
    iex> node = MST.Node.empty()
    iex> {:ok, cid} = MST.Node.cid(node)
    iex> store = MST.Store.put(store, cid, node)
    iex> tree = MST.Tree.from_root(cid, store)
    iex> tree.root == cid
    true

"""
@spec from_root(CID.t() | nil, store()) :: t()
def from_root(root, store) do
  %__MODULE__{store: store, root: root}
end
7777+7878+ # ---------------------------------------------------------------------------
7979+ # Lookup
8080+ # ---------------------------------------------------------------------------
@doc """
Fetches the value CID stored under `key`.

Returns `{:ok, value_cid}` on a hit and `{:error, :not_found}` on a miss,
including when the tree is empty.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> MST.Tree.get(tree, "col/key")
    {:error, :not_found}

"""
@spec get(t(), binary()) :: {:ok, CID.t()} | {:error, :not_found} | tree_error()
def get(%__MODULE__{root: root, store: store}, key) do
  # descend/3 answers :not_found for a nil link and searches otherwise, which
  # is exactly the empty-tree / non-empty-tree split needed here.
  descend(store, root, key)
end
101101+102102+ # ---------------------------------------------------------------------------
103103+ # Mutation
104104+ # ---------------------------------------------------------------------------
@doc """
Stores `value` under `key`, replacing any existing value for that key.

Returns `{:ok, new_tree}` on success. The new tree shares its store with the
old one, yet both remain usable on their own because nodes are only ever
appended.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> val = DASL.CID.compute("data")
    iex> {:ok, tree2} = MST.Tree.put(tree, "col/key", val)
    iex> MST.Tree.get(tree2, "col/key")
    {:ok, val}

"""
@spec put(t(), binary(), CID.t()) :: {:ok, t()} | tree_error()
def put(%__MODULE__{root: nil, store: store}, key, value) do
  # Empty tree: a single leaf node is enough. No intermediate layers are
  # created for a one-key tree (the spec says empty nodes at the top must be
  # pruned).
  with {:ok, root, store} <- write_node(store, leaf_node(key, value)) do
    {:ok, %__MODULE__{root: root, store: store}}
  end
end

def put(%__MODULE__{root: root, store: store}, key, value) do
  case fetch_node(store, root) do
    {:ok, %{entries: [], left: nil}} ->
      # Empty root from CAR import — treat as fresh empty tree.
      put(%__MODULE__{root: nil, store: store}, key, value)

    {:ok, root_node} ->
      key_height = Height.for_key(key)
      tree_height = require_height(store, root_node)

      with {:ok, new_root, store} <-
             do_insert(store, root, key, value, key_height, tree_height) do
        {:ok, %__MODULE__{root: new_root, store: store}}
      end

    failure ->
      failure
  end
end
@doc """
Removes `key` from the tree.

Returns `{:ok, new_tree}` on success, `{:error, :not_found}` if the key
does not exist. Failures while reading nodes — both during the removal and
while trimming empty wrapper nodes from the top afterwards — are returned as
error tuples rather than raised.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> val = DASL.CID.compute("data")
    iex> {:ok, tree2} = MST.Tree.put(tree, "col/key", val)
    iex> {:ok, tree3} = MST.Tree.delete(tree2, "col/key")
    iex> MST.Tree.get(tree3, "col/key")
    {:error, :not_found}

"""
@spec delete(t(), binary()) :: {:ok, t()} | {:error, :not_found} | tree_error()
def delete(%__MODULE__{root: nil}, _key), do: {:error, :not_found}

def delete(%__MODULE__{root: root, store: store}, key) do
  with {:ok, root_node} <- fetch_node(store, root),
       key_height = Height.for_key(key),
       tree_height = require_height(store, root_node),
       {:ok, new_root, store} <- do_remove(store, root, key, key_height, tree_height),
       # Trim empty wrappers from the top after deletion. trim_top/2 reads
       # nodes and can fail, so it is part of the `with` chain instead of an
       # assertive match that would raise MatchError on an error tuple.
       {:ok, new_root, store} <- trim_top(store, new_root) do
    {:ok, %__MODULE__{root: new_root, store: store}}
  end
end
183183+184184+ # ---------------------------------------------------------------------------
185185+ # Traversal
186186+ # ---------------------------------------------------------------------------
@doc """
Lists every `{key, value_cid}` pair of the tree in sorted key order.

An empty tree yields `{:ok, []}`.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> val = DASL.CID.compute("data")
    iex> {:ok, tree} = MST.Tree.put(tree, "col/b", val)
    iex> {:ok, tree} = MST.Tree.put(tree, "col/a", val)
    iex> {:ok, pairs} = MST.Tree.to_list(tree)
    iex> Enum.map(pairs, &elem(&1, 0))
    ["col/a", "col/b"]

"""
@spec to_list(t()) :: {:ok, [{binary(), CID.t()}]} | tree_error()
def to_list(%__MODULE__{root: root, store: store}) do
  case root do
    nil -> {:ok, []}
    root_cid -> walk(store, root_cid, [])
  end
end
@doc """
Lazily enumerates the tree's `{key, value_cid}` pairs in sorted order.

Nodes are fetched from the store only as the stream is consumed. A missing
node raises (consistent with lazy stream semantics). An empty tree yields an
empty list.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> val = DASL.CID.compute("x")
    iex> {:ok, tree} = MST.Tree.put(tree, "col/a", val)
    iex> tree |> MST.Tree.stream() |> Enum.to_list()
    [{"col/a", val}]

"""
@spec stream(t()) :: Enumerable.t()
def stream(%__MODULE__{root: nil}), do: []

def stream(%__MODULE__{root: root, store: store}) do
  # The stream state is a stack of pending work, seeded with the root CID.
  start = fn -> [root] end

  step = fn
    [] ->
      {:halt, []}

    [cid | pending] ->
      node = fetch_node!(store, cid)
      # Yields `{items_to_emit, next_stack}` for Stream.resource/3.
      node_to_stream_items(node, Node.keys(node), pending)
  end

  cleanup = fn _stack -> :ok end

  Stream.resource(start, step, cleanup)
end
@doc """
Counts the key-value pairs in the tree.

The count is obtained by listing the whole tree with `to_list/1`, so any
error from that traversal is returned unchanged.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> {:ok, 0} = MST.Tree.length(tree)
    iex> val = DASL.CID.compute("x")
    iex> {:ok, tree} = MST.Tree.put(tree, "col/a", val)
    iex> MST.Tree.length(tree)
    {:ok, 1}

"""
@spec length(t()) :: {:ok, non_neg_integer()} | tree_error()
def length(tree) do
  case to_list(tree) do
    {:ok, pairs} -> {:ok, Kernel.length(pairs)}
    failure -> failure
  end
end
266266+267267+ # ---------------------------------------------------------------------------
268268+ # Block collection
269269+ # ---------------------------------------------------------------------------
@doc """
Gathers every MST node reachable from the root as a CID → encoded bytes map.

Useful for serialising the tree to a CAR file. An empty tree yields an empty
map.

## Examples

    iex> store = MST.Store.Memory.new()
    iex> tree = MST.Tree.new(store)
    iex> val = DASL.CID.compute("x")
    iex> {:ok, tree} = MST.Tree.put(tree, "col/a", val)
    iex> {:ok, blocks} = MST.Tree.collect_blocks(tree)
    iex> map_size(blocks) >= 1
    true

"""
@spec collect_blocks(t()) :: {:ok, %{CID.t() => binary()}} | tree_error()
def collect_blocks(%__MODULE__{root: root, store: store}) do
  case root do
    nil -> {:ok, %{}}
    root_cid -> collect_reachable(store, root_cid, %{})
  end
end
293293+294294+ # ---------------------------------------------------------------------------
295295+ # Private — search
296296+ # ---------------------------------------------------------------------------
# Looks for `key` in the subtree rooted at `cid`: loads the node, expands its
# keys and hands over to search_node/4. Fetch failures are returned as-is.
@spec search(store(), CID.t(), binary()) ::
        {:ok, CID.t()} | {:error, :not_found} | tree_error()
defp search(store, cid, key) do
  case fetch_node(store, cid) do
    {:ok, node} -> search_node(store, node, Node.keys(node), key)
    failure -> failure
  end
end
# Resolves `key` within one node. locate/2 reports either the index of an exact
# match, that the search continues in the left link, or the index of the entry
# whose right link should be followed.
@spec search_node(store(), Node.t(), [binary()], binary()) ::
        {:ok, CID.t()} | {:error, :not_found} | tree_error()
defp search_node(store, node, full_keys, key) do
  case locate(full_keys, key) do
    {:left} ->
      descend(store, node.left, key)

    {:right, idx} ->
      entry = Enum.at(node.entries, idx)
      descend(store, entry.right, key)

    {:found, idx} ->
      entry = Enum.at(node.entries, idx)
      {:ok, entry.value}
  end
end
# Follows an optional subtree link: a nil link means the key cannot exist,
# any other link is searched.
@spec descend(store(), CID.t() | nil, binary()) ::
        {:ok, CID.t()} | {:error, :not_found} | tree_error()
defp descend(store, link, key) do
  case link do
    nil -> {:error, :not_found}
    cid -> search(store, cid, key)
  end
end
326326+327327+ # ---------------------------------------------------------------------------
328328+ # Private — insert
329329+ # ---------------------------------------------------------------------------
# Inserts `key` → `value` into the subtree rooted at `cid` and returns the CID
# of the rewritten subtree root together with the updated store.
# `tree_height` is the height of the node at `cid`, handed down by the caller.
@spec do_insert(store(), CID.t(), binary(), CID.t(), non_neg_integer(), non_neg_integer()) ::
        {:ok, CID.t(), store()} | tree_error()
defp do_insert(store, cid, key, value, key_height, tree_height) do
  case fetch_node(store, cid) do
    {:ok, _node} when key_height > tree_height ->
      # The key lives on a higher layer: put an empty parent on top of the
      # current node and retry one level up.
      with {:ok, wrapper_cid, store} <- write_node(store, %Node{left: cid, entries: []}) do
        do_insert(store, wrapper_cid, key, value, key_height, tree_height + 1)
      end

    {:ok, node} when key_height < tree_height ->
      # The key lives further down: pick the child slot it falls into.
      {kv_pairs, subtrees} = node_to_arrays(node)
      slot = kv_pairs |> Enum.map(&elem(&1, 0)) |> lower_bound(key)
      child = Enum.at(subtrees, slot)

      with {:ok, new_child, store} <-
             insert_into_subtree(store, child, key, value, key_height, tree_height - 1) do
        updated_subtrees = List.replace_at(subtrees, slot, new_child)
        write_node(store, arrays_to_node(kv_pairs, updated_subtrees))
      end

    {:ok, node} ->
      # key_height == tree_height — the key belongs in this very node.
      put_here(store, node, key, value)

    failure ->
      failure
  end
end
# Inserts into a child slot that may be empty. For a nil slot a fresh leaf is
# written and handed to wrap_with_empty_layers/3 together with the height gap
# (`expected_height - key_height`); otherwise the insert recurses into the
# existing child.
@spec insert_into_subtree(
        store(),
        CID.t() | nil,
        binary(),
        CID.t(),
        non_neg_integer(),
        non_neg_integer()
      ) :: {:ok, CID.t(), store()} | tree_error()
defp insert_into_subtree(store, nil, key, value, key_height, expected_height) do
  case write_node(store, leaf_node(key, value)) do
    {:ok, leaf_cid, store} ->
      wrap_with_empty_layers(store, leaf_cid, expected_height - key_height)

    failure ->
      failure
  end
end

defp insert_into_subtree(store, child_cid, key, value, key_height, expected_height) do
  do_insert(store, child_cid, key, value, key_height, expected_height)
end
# Places a key in this node (key_height == tree_height). An existing key gets
# its value replaced; a new key splits the child subtree at its position so
# that smaller keys end up on its left and larger ones on its right.
@spec put_here(store(), Node.t(), binary(), CID.t()) ::
        {:ok, CID.t(), store()} | tree_error()
defp put_here(store, node, key, value) do
  {kv_pairs, subtrees} = node_to_arrays(node)
  keys = Enum.map(kv_pairs, &elem(&1, 0))
  slot = lower_bound(keys, key)

  # Enum.at/2 gives nil past the end, which never equals a binary key.
  if Enum.at(keys, slot) == key do
    # Overwrite existing key.
    updated_kv = List.replace_at(kv_pairs, slot, {key, value})
    write_node(store, arrays_to_node(updated_kv, subtrees))
  else
    child = Enum.at(subtrees, slot)

    with {:ok, left_part, right_part, store} <- split_on_key(store, child, key) do
      updated_kv = List.insert_at(kv_pairs, slot, {key, value})

      updated_subtrees =
        subtrees
        |> List.replace_at(slot, left_part)
        |> List.insert_at(slot + 1, right_part)

      write_node(store, arrays_to_node(updated_kv, updated_subtrees))
    end
  end
end
# Splits the subtree at `cid` around `key`, recursing along the boundary.
# Returns `{:ok, left_cid, right_cid, store}` where the left side holds all
# keys < `key` and the right side all keys >= `key`. Either side may be nil
# if empty.
@spec split_on_key(store(), CID.t() | nil, binary()) ::
        {:ok, CID.t() | nil, CID.t() | nil, store()} | tree_error()
defp split_on_key(store, nil, _key), do: {:ok, nil, nil, store}

defp split_on_key(store, cid, key) do
  with {:ok, node} <- fetch_node(store, cid) do
    {kv_pairs, subtrees} = node_to_arrays(node)
    slot = kv_pairs |> Enum.map(&elem(&1, 0)) |> lower_bound(key)
    boundary_child = Enum.at(subtrees, slot)

    # The child sitting on the boundary has to be split as well.
    with {:ok, inner_left, inner_right, store} <- split_on_key(store, boundary_child, key) do
      left_node =
        arrays_to_node(Enum.take(kv_pairs, slot), Enum.take(subtrees, slot) ++ [inner_left])

      right_node =
        arrays_to_node(
          Enum.drop(kv_pairs, slot),
          [inner_right | Enum.drop(subtrees, slot + 1)]
        )

      with {:ok, left_cid, store} <- write_node_to_nullable(store, left_node),
           {:ok, right_cid, store} <- write_node_to_nullable(store, right_node) do
        {:ok, left_cid, right_cid, store}
      end
    end
  end
end
453453+454454+ # ---------------------------------------------------------------------------
455455+ # Private — delete
456456+ # ---------------------------------------------------------------------------
# Removes `key` from the subtree rooted at `cid`. `tree_height` is the height
# of that node, handed down by the caller. Answers {:error, :not_found} when
# the key is not present.
@spec do_remove(store(), CID.t(), binary(), non_neg_integer(), non_neg_integer()) ::
        {:ok, CID.t() | nil, store()} | {:error, :not_found} | tree_error()
defp do_remove(store, cid, key, key_height, tree_height) do
  case fetch_node(store, cid) do
    {:ok, _node} when key_height > tree_height ->
      # A key of this height cannot live anywhere in this subtree.
      {:error, :not_found}

    {:ok, node} when key_height < tree_height ->
      {kv_pairs, subtrees} = node_to_arrays(node)
      slot = kv_pairs |> Enum.map(&elem(&1, 0)) |> lower_bound(key)

      case Enum.at(subtrees, slot) do
        nil ->
          {:error, :not_found}

        child_cid ->
          with {:ok, new_child, store} <-
                 do_remove(store, child_cid, key, key_height, tree_height - 1) do
            updated_subtrees = List.replace_at(subtrees, slot, new_child)
            write_node_to_nullable(store, arrays_to_node(kv_pairs, updated_subtrees))
          end
      end

    {:ok, node} ->
      # key_height == tree_height — the key is in this node or nowhere.
      {kv_pairs, subtrees} = node_to_arrays(node)
      keys = Enum.map(kv_pairs, &elem(&1, 0))
      slot = lower_bound(keys, key)

      # Enum.at/2 gives nil past the end, which never equals a binary key.
      if Enum.at(keys, slot) == key do
        # Found: the two subtrees that flanked the key become one.
        left_neighbour = Enum.at(subtrees, slot)
        right_neighbour = Enum.at(subtrees, slot + 1)

        with {:ok, merged, store} <- do_merge(store, left_neighbour, right_neighbour) do
          updated_kv = List.delete_at(kv_pairs, slot)

          updated_subtrees =
            Enum.take(subtrees, slot) ++ [merged | Enum.drop(subtrees, slot + 2)]

          write_node_to_nullable(store, arrays_to_node(updated_kv, updated_subtrees))
        end
      else
        {:error, :not_found}
      end

    failure ->
      failure
  end
end
# Joins two neighbouring subtree links into one. When both exist, their
# entries are concatenated and the two subtrees that meet in the middle (last
# of the left node, first of the right node) are merged recursively.
@spec do_merge(store(), CID.t() | nil, CID.t() | nil) ::
        {:ok, CID.t() | nil, store()} | tree_error()
defp do_merge(store, nil, right_cid), do: {:ok, right_cid, store}
defp do_merge(store, left_cid, nil), do: {:ok, left_cid, store}

defp do_merge(store, left_cid, right_cid) do
  with {:ok, left_node} <- fetch_node(store, left_cid),
       {:ok, right_node} <- fetch_node(store, right_cid) do
    {left_kv, left_subs} = node_to_arrays(left_node)
    {right_kv, right_subs} = node_to_arrays(right_node)

    inner_left = List.last(left_subs)
    inner_right = hd(right_subs)

    with {:ok, merged_boundary, store} <- do_merge(store, inner_left, inner_right) do
      # Everything but the last left subtree, then the merged boundary, then
      # everything but the first right subtree.
      merged_subs = Enum.drop(left_subs, -1) ++ [merged_boundary | tl(right_subs)]
      write_node_to_nullable(store, arrays_to_node(left_kv ++ right_kv, merged_subs))
    end
  end
end
# Peels entry-less wrapper nodes off the top of the tree.
#
# Starting at `cid`, follows `left` pointers for as long as the current node
# has no entries. Returns the first node that has entries, or nil when the
# chain ends in a node with neither entries nor a left pointer. Any error
# from `fetch_node/2` is returned unchanged.
@spec trim_top(store(), CID.t() | nil) :: {:ok, CID.t() | nil, store()} | tree_error()
defp trim_top(store, nil), do: {:ok, nil, store}

defp trim_top(store, cid) do
  case fetch_node(store, cid) do
    {:ok, %{entries: [], left: nil}} -> {:ok, nil, store}
    {:ok, %{entries: [], left: below}} -> trim_top(store, below)
    {:ok, _has_entries} -> {:ok, cid, store}
    failure -> failure
  end
end
545545+546546+ # ---------------------------------------------------------------------------
547547+ # Private — in-order traversal (to_list)
548548+ # ---------------------------------------------------------------------------
# In-order traversal starting at `cid`.
#
# `acc` holds pairs that have already been emitted, in ascending order; the
# result is `acc` followed by every `{key, value}` pair reachable from `cid`,
# also in ascending order.
#
# The helpers below all share ONE accumulator kept in reverse order (newest
# pair first) so that each pair is prepended in O(1). The accumulator is
# reversed exactly once, here. Reversing at the end of every node (as was done
# previously) re-reversed pairs collected by parent and sibling nodes and
# scrambled the output for any tree more than one level deep.
@spec walk(store(), CID.t(), [{binary(), CID.t()}]) ::
        {:ok, [{binary(), CID.t()}]} | tree_error()
defp walk(store, cid, acc) do
  with {:ok, rev_acc} <- walk_subtree(store, cid, Enum.reverse(acc)) do
    {:ok, Enum.reverse(rev_acc)}
  end
end

# Visits one already-loaded node: its left subtree first, then its entries.
# `rev_acc` is in reverse order and is returned in reverse order.
@spec walk_node(store(), Node.t(), [binary()], [{binary(), CID.t()}]) ::
        {:ok, [{binary(), CID.t()}]} | tree_error()
defp walk_node(store, node, full_keys, rev_acc) do
  with {:ok, rev_acc} <- walk_subtree(store, node.left, rev_acc) do
    walk_entries(store, node.entries, full_keys, rev_acc)
  end
end

# Loads and visits the subtree at `cid`; a nil pointer contributes nothing.
# `rev_acc` is in reverse order and is returned in reverse order.
@spec walk_subtree(store(), CID.t() | nil, [{binary(), CID.t()}]) ::
        {:ok, [{binary(), CID.t()}]} | tree_error()
defp walk_subtree(_store, nil, rev_acc), do: {:ok, rev_acc}

defp walk_subtree(store, cid, rev_acc) do
  with {:ok, node} <- fetch_node(store, cid) do
    walk_node(store, node, Node.keys(node), rev_acc)
  end
end

# Emits each entry's pair followed by the contents of its right subtree.
# `rev_acc` is in reverse order and is returned in reverse order — no reversal
# happens here; that is the job of `walk/3`.
@spec walk_entries(store(), [Entry.t()], [binary()], [{binary(), CID.t()}]) ::
        {:ok, [{binary(), CID.t()}]} | tree_error()
defp walk_entries(_store, [], [], rev_acc), do: {:ok, rev_acc}

defp walk_entries(store, [entry | rest_e], [key | rest_k], rev_acc) do
  rev_acc = [{key, entry.value} | rev_acc]

  with {:ok, rev_acc} <- walk_subtree(store, entry.right, rev_acc) do
    walk_entries(store, rest_e, rest_k, rev_acc)
  end
end
585585+586586+ # ---------------------------------------------------------------------------
587587+ # Private — stream helpers
588588+ # ---------------------------------------------------------------------------
# Splits one node into the pairs to yield now and the subtrees to visit later.
#
# Returns `{yields, stack}` where `yields` is the node's `{key, value}` pairs
# in entry order, and `stack` is the node's non-nil child pointers (left
# pointer first, then each entry's right pointer in entry order) placed in
# front of `rest_stack`. Entries are paired with `full_keys` positionally.
@spec node_to_stream_items(Node.t(), [binary()], list()) :: {[{binary(), CID.t()}], list()}
defp node_to_stream_items(node, full_keys, rest_stack) do
  keyed_entries = Enum.zip(node.entries, full_keys)

  yields = for {entry, key} <- keyed_entries, do: {key, entry.value}

  # The bare `ptr` filter drops absent (nil) child pointers.
  child_ptrs =
    for ptr <- [node.left | Enum.map(keyed_entries, fn {entry, _key} -> entry.right end)],
        ptr,
        do: ptr

  {yields, child_ptrs ++ rest_stack}
end
613613+614614+ # ---------------------------------------------------------------------------
615615+ # Private — block collection
616616+ # ---------------------------------------------------------------------------
# Collects the encoded bytes of every node reachable from `cid` into `acc`
# (a map of CID => encoded node bytes).
#
# A CID already present in `acc` is not visited again. Returns `{:ok, acc}` on
# success. A node missing from the store yields `{:error, :missing_node}`, and
# an encoding failure `{:error, :encode, reason}` is flattened to
# `{:error, reason}`.
@spec collect_reachable(store(), CID.t(), %{CID.t() => binary()}) ::
        {:ok, %{CID.t() => binary()}} | tree_error()
defp collect_reachable(store, cid, acc) do
  if Map.has_key?(acc, cid) do
    {:ok, acc}
  else
    with {:ok, node} <- fetch_node(store, cid),
         {:ok, bytes} <- Node.encode(node) do
      acc = Map.put(acc, cid, bytes)
      collect_children(store, node, acc)
    else
      {:error, :not_found} -> {:error, :missing_node}
      {:error, :encode, reason} -> {:error, reason}
      # fetch_node/2 already reports a missing block as {:error, :missing_node}.
      # Without this clause that value matched nothing in `else` and raised
      # WithClauseError instead of being returned to the caller.
      {:error, _reason} = err -> err
    end
  end
end
# Runs `collect_reachable/3` over every non-nil child pointer of `node`
# (left pointer first, then each entry's right pointer), threading the
# accumulator through and stopping at the first non-ok result.
@spec collect_children(store(), Node.t(), %{CID.t() => binary()}) ::
        {:ok, %{CID.t() => binary()}} | tree_error()
defp collect_children(store, node, acc) do
  [node.left | Enum.map(node.entries, & &1.right)]
  |> Enum.filter(& &1)
  |> Enum.reduce_while({:ok, acc}, fn child_cid, {:ok, seen} ->
    case collect_reachable(store, child_cid, seen) do
      {:ok, _seen} = ok -> {:cont, ok}
      failure -> {:halt, failure}
    end
  end)
end
649649+650650+ # ---------------------------------------------------------------------------
651651+ # Private — node I/O
652652+ # ---------------------------------------------------------------------------
# Looks up the node stored under `cid`. A store miss (`{:error, :not_found}`)
# is reported to callers as `{:error, :missing_node}`.
@spec fetch_node(store(), CID.t()) :: {:ok, Node.t()} | tree_error()
defp fetch_node(store, cid) do
  store
  |> Store.get(cid)
  |> case do
    {:error, :not_found} -> {:error, :missing_node}
    {:ok, _node} = hit -> hit
  end
end
# Raising variant of the node lookup: returns the node itself, or raises a
# RuntimeError naming the encoded CID when the store does not hold it.
@spec fetch_node!(store(), CID.t()) :: Node.t()
defp fetch_node!(store, cid) do
  store
  |> Store.get(cid)
  |> case do
    {:error, :not_found} -> raise RuntimeError, "MST node not found: #{CID.encode(cid)}"
    {:ok, found} -> found
  end
end
# Computes the CID of `node` and stores the node under it. Returns the CID
# together with the updated store; an `{:error, :encode, reason}` from
# `Node.cid/1` is flattened to `{:error, reason}`.
@spec write_node(store(), Node.t()) :: {:ok, CID.t(), store()} | tree_error()
defp write_node(store, node) do
  node
  |> Node.cid()
  |> case do
    {:error, :encode, why} -> {:error, why}
    {:ok, cid} -> {:ok, cid, Store.put(store, cid, node)}
  end
end
# Persists `node` unless it is completely empty. A node with no entries and
# no left pointer is represented by a nil CID and nothing is written; a node
# with no entries but a left pointer is still written like any other node.
@spec write_node_to_nullable(store(), Node.t()) ::
        {:ok, CID.t() | nil, store()} | tree_error()
defp write_node_to_nullable(store, node) do
  case node do
    %Node{left: nil, entries: []} -> {:ok, nil, store}
    _non_empty -> write_node(store, node)
  end
end
# Stacks `n` entry-less nodes on top of `cid`, each one pointing at the layer
# below through its `left` pointer. Returns the CID of the outermost wrapper
# (or `cid` itself when `n` is 0) and the updated store.
@spec wrap_with_empty_layers(store(), CID.t(), non_neg_integer()) ::
        {:ok, CID.t(), store()} | tree_error()
defp wrap_with_empty_layers(store, cid, n) when n > 0 do
  case write_node(store, %Node{left: cid, entries: []}) do
    {:ok, outer_cid, store} -> wrap_with_empty_layers(store, outer_cid, n - 1)
    failure -> failure
  end
end

defp wrap_with_empty_layers(store, cid, 0), do: {:ok, cid, store}
698698+699699+ # ---------------------------------------------------------------------------
700700+ # Private — key position helpers
701701+ # ---------------------------------------------------------------------------
# Returns the position of `key` in the sorted `full_keys` list:
#   {:found, idx}  — key is at index idx
#   {:left}        — key < all keys (belongs in left subtree)
#   {:right, idx}  — key > keys[idx] (belongs in right subtree of entry idx)
#
# The keys are copied into a tuple once so that every probe of the binary
# search is an O(1) `elem/2`. Probing the linked list with `Enum.at/2` costs
# O(n) per probe, which made the "binary" search slower than a linear scan.
@spec locate([binary()], binary()) ::
        {:found, non_neg_integer()} | {:left} | {:right, non_neg_integer()}
defp locate([], _key), do: {:left}

defp locate(keys, key) do
  indexed = List.to_tuple(keys)
  n = tuple_size(indexed)
  bin_locate(indexed, key, 0, n - 1, n)
end

# Binary search over `keys` restricted to the inclusive index range lo..hi.
# `keys` may be a tuple or a list (a list is converted to a tuple first).
# `n` is accepted for interface compatibility and is not consulted.
@spec bin_locate([binary()] | tuple(), binary(), integer(), integer(), non_neg_integer()) ::
        {:found, non_neg_integer()} | {:left} | {:right, non_neg_integer()}
defp bin_locate(keys, key, lo, hi, n) when is_list(keys) do
  bin_locate(List.to_tuple(keys), key, lo, hi, n)
end

defp bin_locate(_keys, _key, lo, hi, _n) when lo > hi do
  # Range exhausted: `lo` is the insertion point. Index 0 means the key sorts
  # before everything; otherwise it sits to the right of entry lo - 1.
  if lo == 0, do: {:left}, else: {:right, lo - 1}
end

defp bin_locate(keys, key, lo, hi, n) do
  mid = div(lo + hi, 2)
  mid_key = elem(keys, mid)

  cond do
    mid_key == key -> {:found, mid}
    mid_key < key -> bin_locate(keys, key, mid + 1, hi, n)
    true -> bin_locate(keys, key, lo, mid - 1, n)
  end
end
# Index of the first key that is >= `target`; equals the number of keys when
# every key is smaller. Computed as the length of the leading run of keys
# that sort strictly before `target`.
@spec lower_bound([binary()], binary()) :: non_neg_integer()
defp lower_bound(keys, target) do
  keys
  |> Enum.take_while(fn k -> k < target end)
  |> Kernel.length()
end
738738+739739+ # ---------------------------------------------------------------------------
740740+ # Private — layer inference
741741+ # ---------------------------------------------------------------------------
# Layer of a node, taken from `Height.for_key/1` applied to the first entry's
# `key_suffix`. Returns nil when the node has no entries.
# NOTE(review): this presumes the first entry stores its whole key in
# `key_suffix` (i.e. prefix_len 0) — confirm against `Node.compress_entries/1`.
@spec node_layer(Node.t()) :: non_neg_integer() | nil
defp node_layer(%Node{entries: entries}) do
  case entries do
    [] -> nil
    [%{key_suffix: leading_key} | _rest] -> Height.for_key(leading_key)
  end
end
# Computes the height of `node`.
#
# A node with entries gets its height from `node_layer/1`. A node without
# entries is one level above whatever its `left` pointer leads to, so the
# child is loaded and measured recursively; with no `left` pointer either,
# the height is 0.
#
# Raises (via `fetch_node!/2`) when a child referenced by a `left` pointer is
# absent from the store. The child is loaded with the raising helper so the
# failure names the missing CID instead of surfacing as a bare MatchError.
@spec require_height(store(), Node.t()) :: non_neg_integer()
defp require_height(store, node) do
  case node_layer(node) do
    nil ->
      if node.left do
        child = fetch_node!(store, node.left)
        require_height(store, child) + 1
      else
        0
      end

    h ->
      h
  end
end
768768+769769+ # ---------------------------------------------------------------------------
770770+ # Private — construction helpers
771771+ # ---------------------------------------------------------------------------
# Builds a node holding exactly one entry for `key` => `value`, with no
# subtree pointers. The entry stores the whole key (prefix_len 0).
@spec leaf_node(binary(), CID.t()) :: Node.t()
defp leaf_node(key, value) do
  sole_entry = %Entry{prefix_len: 0, key_suffix: key, value: value, right: nil}
  %Node{left: nil, entries: [sole_entry]}
end
780780+781781+ # ---------------------------------------------------------------------------
782782+ # Private — node array conversions
783783+ # ---------------------------------------------------------------------------
# Flattens a node into two parallel lists:
#   {[{key, value}], [subtree_cid | nil]}
# The pointer list is one element longer than the pair list: position 0 is
# `node.left`, and position i + 1 is the right pointer of entry i. Keys come
# from `Node.keys/1`.
@spec node_to_arrays(Node.t()) :: {[{binary(), CID.t()}], [CID.t() | nil]}
defp node_to_arrays(node) do
  {values, right_ptrs} =
    node.entries
    |> Enum.map(fn entry -> {entry.value, entry.right} end)
    |> Enum.unzip()

  {Enum.zip(Node.keys(node), values), [node.left | right_ptrs]}
end
# Rebuilds a `Node` from the parallel-list form: the head of `subtrees`
# becomes `left`, and each remaining pointer is paired positionally with a
# key/value pair as that entry's right pointer. The `{key, value, right}`
# triples are handed to `Node.compress_entries/1` to produce the entries.
@spec arrays_to_node([{binary(), CID.t()}], [CID.t() | nil]) :: Node.t()
defp arrays_to_node(kv_pairs, subtrees) do
  [left | right_ptrs] = subtrees

  entries =
    kv_pairs
    |> Enum.zip_with(right_ptrs, fn {k, v}, r -> {k, v, r} end)
    |> Node.compress_entries()

  %Node{left: left, entries: entries}
end
809809+end
+193
test/mst/car_test.exs
defmodule MST.CARTest do
  use ExUnit.Case, async: true

  doctest MST.CAR

  alias DASL.CID
  alias MST.{CAR, Tree}

  # Fresh empty tree over an in-memory store.
  defp new_tree, do: Tree.new(MST.Store.Memory.new())

  # Raw-codec CID of `s`, used as a stand-in record value.
  defp val(s), do: CID.compute(s, :raw)

  # Inserts every {key, value} pair into a fresh tree; crashes if a put fails.
  defp tree_with(pairs) do
    Enum.reduce(pairs, new_tree(), fn {key, value}, tree ->
      {:ok, tree} = Tree.put(tree, key, value)
      tree
    end)
  end

  # Five distinct key/value pairs shared by the multi-key tests.
  defp numbered_pairs do
    Enum.map(1..5, fn i -> {"col/key#{i}", val("v#{i}")} end)
  end

  describe "to_binary/2 and from_binary/2" do
    test "empty tree round-trips" do
      assert {:ok, binary} = CAR.to_binary(new_tree())
      assert is_binary(binary)
      assert {:ok, reloaded} = CAR.from_binary(binary)
      assert {:ok, []} = Tree.to_list(reloaded)
    end

    test "single-key tree round-trips" do
      v = val("data")
      tree = tree_with([{"col/key", v}])
      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, reloaded} = CAR.from_binary(binary)
      assert {:ok, ^v} = Tree.get(reloaded, "col/key")
    end

    test "multi-key tree round-trips with all keys intact" do
      pairs = numbered_pairs()
      tree = tree_with(pairs)

      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, reloaded} = CAR.from_binary(binary)

      Enum.each(pairs, fn {k, v} ->
        assert {:ok, ^v} = Tree.get(reloaded, k)
      end)
    end

    test "round-trip preserves root CID" do
      tree = tree_with([{"col/key", val("data")}])
      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, reloaded} = CAR.from_binary(binary)
      assert tree.root == reloaded.root
    end

    test "round-trip preserves sorted order" do
      v = val("v")
      keys = ["col/z", "col/a", "col/m", "col/b"]
      tree = tree_with(Enum.map(keys, &{&1, v}))

      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, reloaded} = CAR.from_binary(binary)
      assert {:ok, pairs} = Tree.to_list(reloaded)
      assert Enum.map(pairs, fn {key, _value} -> key end) == Enum.sort(keys)
    end
  end

  describe "from_binary/2 error handling" do
    test "returns error for invalid binary" do
      assert {:error, _, _} = CAR.from_binary(<<0xFF, 0xFF, 0xFF>>)
    end

    test "returns error for empty binary" do
      assert {:error, _, _} = CAR.from_binary(<<>>)
    end
  end

  describe "from_car/1" do
    test "empty tree round-trips via struct" do
      assert {:ok, binary} = CAR.to_binary(new_tree())
      assert {:ok, car} = DASL.CAR.decode(binary)
      assert {:ok, reloaded} = CAR.from_car(car)
      assert {:ok, []} = Tree.to_list(reloaded)
    end

    test "single-key tree round-trips via struct" do
      v = val("data")
      tree = tree_with([{"col/key", v}])
      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, car} = DASL.CAR.decode(binary)
      assert {:ok, reloaded} = CAR.from_car(car)
      assert {:ok, ^v} = Tree.get(reloaded, "col/key")
    end

    test "multi-key tree round-trips via struct with all keys intact" do
      pairs = numbered_pairs()
      tree = tree_with(pairs)

      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, car} = DASL.CAR.decode(binary)
      assert {:ok, reloaded} = CAR.from_car(car)

      Enum.each(pairs, fn {k, v} ->
        assert {:ok, ^v} = Tree.get(reloaded, k)
      end)
    end

    test "preserves root CID" do
      tree = tree_with([{"col/key", val("data")}])
      assert {:ok, binary} = CAR.to_binary(tree)
      assert {:ok, car} = DASL.CAR.decode(binary)
      assert {:ok, reloaded} = CAR.from_car(car)
      assert tree.root == reloaded.root
    end

    test "struct with no roots returns empty tree" do
      car = %DASL.CAR{version: 1, roots: [], blocks: %{}}
      assert {:ok, tree} = CAR.from_car(car)
      assert {:ok, []} = Tree.to_list(tree)
    end
  end

  describe "from_stream/2" do
    test "stream round-trip matches binary round-trip" do
      tree = tree_with([{"col/a", val("v")}])
      {:ok, binary} = CAR.to_binary(tree)

      # A lazy stream that emits the whole binary as its only chunk.
      single_chunk =
        Stream.unfold(binary, fn
          <<>> -> nil
          remaining -> {remaining, <<>>}
        end)

      assert {:ok, streamed} = CAR.from_stream(single_chunk)
      assert tree.root == streamed.root
    end

    test "handles multi-chunk stream" do
      tree = tree_with([{"col/a", val("v")}])
      {:ok, binary} = CAR.to_binary(tree)

      # Feed the decoder 4 bytes at a time (the last chunk may be shorter).
      chunks =
        binary
        |> :binary.bin_to_list()
        |> Enum.chunk_every(4)
        |> Enum.map(&:binary.list_to_bin/1)

      assert {:ok, streamed} = CAR.from_stream(chunks)
      assert tree.root == streamed.root
    end
  end

  describe "to_stream/1" do
    test "first item is the header" do
      tree = tree_with([{"col/a", val("v")}])
      items = tree |> CAR.to_stream() |> Enum.to_list()
      assert [{:header, 1, [root]}] = Enum.take(items, 1)
      assert root == tree.root
    end

    test "subsequent items are blocks" do
      tree = tree_with([{"col/a", val("v")}])
      [_header | blocks] = tree |> CAR.to_stream() |> Enum.to_list()
      assert Enum.all?(blocks, &match?({:block, _, _}, &1))
    end

    test "stream contains root block" do
      tree = tree_with([{"col/a", val("v")}])
      items = tree |> CAR.to_stream() |> Enum.to_list()
      block_cids = for {:block, cid, _} <- items, do: cid
      assert tree.root in block_cids
    end
  end
end
+115
test/mst/diff_test.exs
defmodule MST.DiffTest do
  use ExUnit.Case, async: true

  doctest MST.Diff

  alias DASL.CID
  alias MST.{Diff, Tree}

  # Fresh empty tree over an in-memory store.
  defp new_tree, do: Tree.new(MST.Store.Memory.new())

  # Raw-codec CID of `s`, used as a stand-in record value.
  defp val(s), do: CID.compute(s, :raw)

  # Tree.put/3 that crashes on failure and returns the bare tree, so puts can
  # be chained with the pipe operator.
  defp put!(tree, key, value) do
    {:ok, tree} = Tree.put(tree, key, value)
    tree
  end

  describe "compute/2" do
    test "two empty trees produce empty diff" do
      assert {:ok, diff} = Diff.compute(new_tree(), new_tree())
      assert MapSet.size(diff.created_nodes) == 0
      assert MapSet.size(diff.deleted_nodes) == 0
      assert diff.record_ops == []
    end

    test "empty → non-empty: all keys are creates" do
      v = val("v")
      tree_b = put!(new_tree(), "col/a", v)
      assert {:ok, diff} = Diff.compute(new_tree(), tree_b)
      assert [op] = diff.record_ops
      assert op.key == "col/a"
      assert op.old_value == nil
      assert op.new_value == v
    end

    test "non-empty → empty: all keys are deletes" do
      v = val("v")
      tree_a = put!(new_tree(), "col/a", v)
      assert {:ok, diff} = Diff.compute(tree_a, new_tree())
      assert [op] = diff.record_ops
      assert op.key == "col/a"
      assert op.old_value == v
      assert op.new_value == nil
    end

    test "identical trees produce empty diff" do
      tree = put!(new_tree(), "col/a", val("v"))
      assert {:ok, diff} = Diff.compute(tree, tree)
      assert diff.record_ops == []
      assert MapSet.size(diff.created_nodes) == 0
      assert MapSet.size(diff.deleted_nodes) == 0
    end

    test "update: same key, different value" do
      v1 = val("v1")
      v2 = val("v2")
      tree_a = put!(new_tree(), "col/a", v1)
      tree_b = put!(new_tree(), "col/a", v2)
      assert {:ok, diff} = Diff.compute(tree_a, tree_b)
      assert [op] = diff.record_ops
      assert op.old_value == v1
      assert op.new_value == v2
    end

    test "no-op: same key, same value, different surrounding context" do
      v = val("v")
      v2 = val("v2")
      tree_a = new_tree() |> put!("col/a", v) |> put!("col/b", v2)
      tree_b = new_tree() |> put!("col/a", v) |> put!("col/c", v2)
      assert {:ok, diff} = Diff.compute(tree_a, tree_b)
      keys = Enum.map(diff.record_ops, & &1.key)
      refute "col/a" in keys
      assert "col/b" in keys
      assert "col/c" in keys
    end

    test "record_ops are sorted by key" do
      v = val("v")
      tree_b = Enum.reduce(["col/z", "col/a", "col/m"], new_tree(), &put!(&2, &1, v))

      assert {:ok, diff} = Diff.compute(new_tree(), tree_b)
      keys = Enum.map(diff.record_ops, & &1.key)
      assert keys == Enum.sort(keys)
    end

    test "created_nodes and deleted_nodes are non-overlapping for insert" do
      tree_b = put!(new_tree(), "col/a", val("v"))
      assert {:ok, diff} = Diff.compute(new_tree(), tree_b)
      assert MapSet.disjoint?(diff.created_nodes, diff.deleted_nodes)
    end

    test "multi-key add and remove" do
      v = val("v")
      va = val("va")

      base = put!(new_tree(), "col/keep", v)
      tree_a = put!(base, "col/remove", v)
      tree_b = put!(base, "col/add", va)

      assert {:ok, diff} = Diff.compute(tree_a, tree_b)

      op_keys = MapSet.new(diff.record_ops, & &1.key)
      assert MapSet.member?(op_keys, "col/remove")
      assert MapSet.member?(op_keys, "col/add")
      refute MapSet.member?(op_keys, "col/keep")
    end
  end
end
+173
test/mst/fixtures_test.exs
defmodule MST.FixturesTest do
  @moduledoc """
  Checks MST behaviour against the exhaustive mst-test-suite fixtures.

  The 128 CAR files (MSTs 0–127) are decoded once while this module compiles.
  Every JSON file under the diff fixture directory (16,384 vectors, one per
  ordered pair of MSTs) becomes its own test, which verifies that:

  - `MST.Diff.compute/2` reports the expected `created_nodes` and
    `deleted_nodes` sets
  - the record operations (create / update / delete) equal the expected
    `record_ops`

  Tag: `:slow` — run with `mix test` (included by default).
  To exclude: `mix test --exclude slow`
  """

  use ExUnit.Case, async: true

  @fixture_root Path.join([__DIR__, "..", "fixtures", "mst-test-suite"])
  @cars_dir Path.join(@fixture_root, "cars/exhaustive")
  @diff_dir Path.join(@fixture_root, "tests/diff/exhaustive")

  # ---------------------------------------------------------------------------
  # Load all 128 CAR files at compile time
  # ---------------------------------------------------------------------------

  # %{0 => %MST.Tree{}, 1 => %MST.Tree{}, ...}
  @trees Map.new(0..127, fn i ->
           padded = String.pad_leading("#{i}", 3, "0")

           {:ok, tree} =
             @cars_dir
             |> Path.join("exhaustive_#{padded}.car")
             |> File.read!()
             |> MST.CAR.from_binary()

           {i, tree}
         end)

  # ---------------------------------------------------------------------------
  # CAR loading sanity checks
  # ---------------------------------------------------------------------------

  describe "CAR loading" do
    test "all 128 CAR files load successfully" do
      assert map_size(@trees) == 128
    end

    test "MST 0 (empty) loads as an empty tree (no leaf keys)" do
      assert {:ok, []} = MST.Tree.to_list(@trees[0])
    end

    test "MST 127 (all 7 keys) loads with 7 leaf entries" do
      assert {:ok, pairs} = MST.Tree.to_list(@trees[127])
      assert length(pairs) == 7
    end

    test "MST root CIDs are stable (decode twice, same root)" do
      # Spot-check a handful of indices rather than all 128.
      Enum.each([1, 63, 64, 127], fn i ->
        file = "exhaustive_#{String.pad_leading("#{i}", 3, "0")}.car"
        binary = @cars_dir |> Path.join(file) |> File.read!()
        {:ok, first} = MST.CAR.from_binary(binary)
        {:ok, second} = MST.CAR.from_binary(binary)
        assert first.root == second.root, "Root mismatch for MST #{i}"
      end)
    end
  end

  # ---------------------------------------------------------------------------
  # Diff fixtures (16,384 test vectors)
  # ---------------------------------------------------------------------------

  describe "diff fixtures" do
    # One generated test per JSON fixture file, named after the file.
    for path <- Path.wildcard(Path.join(@diff_dir, "*.json")) do
      @path path

      @tag :slow
      test Path.basename(@path, ".json") do
        run_diff_fixture(@path)
      end
    end
  end

  # ---------------------------------------------------------------------------
  # Fixture runner
  # ---------------------------------------------------------------------------

  # Decodes one fixture, diffs the two preloaded trees it names, and compares
  # created nodes, deleted nodes and record operations with the expectations.
  defp run_diff_fixture(path) do
    fixture = path |> File.read!() |> JSON.decode!()
    expected = fixture["results"]

    # Inputs look like "./cars/exhaustive/exhaustive_042.car"; only the index
    # is needed to pick the preloaded tree.
    tree_a = @trees[parse_car_index(fixture["inputs"]["mst_a"])]
    tree_b = @trees[parse_car_index(fixture["inputs"]["mst_b"])]

    assert {:ok, diff} = MST.Diff.compute(tree_a, tree_b)

    assert_cid_sets_match("created_nodes", diff.created_nodes, expected["created_nodes"], path)
    assert_cid_sets_match("deleted_nodes", diff.deleted_nodes, expected["deleted_nodes"], path)

    expected_ops = parse_record_ops(expected["record_ops"])
    actual_ops = format_record_ops(diff.record_ops)

    assert actual_ops == expected_ops,
           "record_ops mismatch for #{Path.basename(path)}\n" <>
             " expected: #{inspect(expected_ops)}\n" <>
             " got: #{inspect(actual_ops)}"
  end

  # Compares a set of CIDs from the diff (encoded to strings) with the raw
  # string list from the fixture; `label` names the field in the failure text.
  defp assert_cid_sets_match(label, actual_cids, expected_raw, path) do
    expected = parse_cid_list(expected_raw)
    actual = MapSet.new(actual_cids, &DASL.CID.encode/1)

    assert actual == expected,
           "#{label} mismatch for #{Path.basename(path)}\n" <>
             " expected: #{inspect(MapSet.to_list(expected))}\n" <>
             " got: #{inspect(MapSet.to_list(actual))}"
  end

  # ---------------------------------------------------------------------------
  # Parsing helpers
  # ---------------------------------------------------------------------------

  # "./cars/exhaustive/exhaustive_042.car" → 42
  defp parse_car_index(path_str) do
    path_str
    |> Path.basename(".car")
    |> String.replace_prefix("exhaustive_", "")
    |> String.to_integer()
  end

  # A missing list in the fixture is treated as the empty set.
  defp parse_cid_list(nil), do: MapSet.new()
  defp parse_cid_list(list) when is_list(list), do: MapSet.new(list)

  # Normalises the fixture's record ops into maps sorted by key.
  defp parse_record_ops(nil), do: []

  defp parse_record_ops(ops) when is_list(ops) do
    normalised =
      for op <- ops do
        %{key: op["rpath"], old_value: op["old_value"], new_value: op["new_value"]}
      end

    Enum.sort_by(normalised, & &1.key)
  end

  # Same shape as parse_record_ops/1, built from the computed diff; CIDs are
  # encoded to strings so the two sides compare directly.
  defp format_record_ops(ops) do
    formatted =
      for op <- ops do
        %{
          key: op.key,
          old_value: encode_or_nil(op.old_value),
          new_value: encode_or_nil(op.new_value)
        }
      end

    Enum.sort_by(formatted, & &1.key)
  end

  # Encodes a CID; an absent value stays nil.
  defp encode_or_nil(cid), do: if(cid, do: DASL.CID.encode(cid))
end
+53
test/mst/height_test.exs
defmodule MST.HeightTest do
  use ExUnit.Case, async: true

  doctest MST.Height

  alias MST.Height

  describe "for_key/1" do
    # Spec examples from https://atproto.com/specs/repository#mst-structure
    test "spec example: depth 0" do
      assert Height.for_key("2653ae71") == 0
    end

    test "spec example: depth 1" do
      assert Height.for_key("blue") == 1
    end

    test "spec example: depth 4" do
      assert Height.for_key("app.bsky.feed.post/454397e440ec") == 4
    end

    test "spec example: depth 8" do
      assert Height.for_key("app.bsky.feed.post/9adeb165882c") == 8
    end

    test "returns non-negative integer" do
      assert Height.for_key("anything") >= 0
    end

    test "empty binary returns non-negative integer" do
      # Only checks that hashing the empty key does not crash.
      assert Height.for_key("") >= 0
    end

    test "depth 0 is the most common result" do
      # Roughly three quarters of keys are expected at depth 0; the bound
      # asserted here is deliberately loose (more than 50 out of 100).
      depth_zero_count =
        Enum.count(1..100, fn i -> Height.for_key("test/key#{i}") == 0 end)

      assert depth_zero_count > 50
    end

    test "depth is consistent for the same key" do
      key = "some/key"
      assert Height.for_key(key) == Height.for_key(key)
    end

    test "different keys generally produce different depths" do
      shallow = Height.for_key("2653ae71")
      deeper = Height.for_key("blue")
      assert shallow != deeper
    end
  end
end
+199
test/mst/node_test.exs
defmodule MST.NodeTest do
  use ExUnit.Case, async: true

  doctest MST.Node

  alias DASL.CID
  alias MST.Node
  alias MST.Node.Entry

  # Value CIDs reused by every test; computed once when the module compiles.
  @cid_a CID.compute("value_a", :raw)
  @cid_b CID.compute("value_b", :raw)
  @cid_c CID.compute("value_c", :raw)

  describe "empty/0" do
    test "returns an empty node" do
      assert %Node{left: nil, entries: []} = Node.empty()
    end
  end

  describe "encode/1 and decode/1 round-trip" do
    test "empty node" do
      empty = Node.empty()
      assert ^empty = round_trip(empty)
    end

    test "node with single entry, no subtrees" do
      decoded = round_trip(%Node{left: nil, entries: [entry(0, "col/key", @cid_a)]})

      assert decoded.left == nil
      assert [only] = decoded.entries
      assert only.key_suffix == "col/key"
      assert only.value == @cid_a
      assert only.right == nil
    end

    test "node with left subtree pointer" do
      decoded = round_trip(%Node{left: @cid_b, entries: [entry(0, "col/key", @cid_a)]})

      assert decoded.left == @cid_b
    end

    test "node with right subtree pointer" do
      decoded = round_trip(%Node{left: nil, entries: [entry(0, "col/key", @cid_a, @cid_b)]})

      assert [first | _] = decoded.entries
      assert first.right == @cid_b
    end

    test "node with multiple entries and prefix compression" do
      # "app.bsky.feed.post/" is 19 bytes, so the 2nd and 3rd entries share a
      # 19-byte prefix with their predecessor.
      node = %Node{
        left: nil,
        entries: [
          entry(0, "app.bsky.feed.post/aaa", @cid_a),
          entry(19, "bbb", @cid_b),
          entry(19, "ccc", @cid_c)
        ]
      }

      expected_keys = [
        "app.bsky.feed.post/aaa",
        "app.bsky.feed.post/bbb",
        "app.bsky.feed.post/ccc"
      ]

      assert node |> round_trip() |> Node.keys() == expected_keys
    end

    test "CID is stable across encode → decode → re-encode" do
      node = %Node{left: nil, entries: [entry(0, "col/key", @cid_a)]}

      assert {:ok, first_pass} = Node.encode(node)
      assert {:ok, decoded} = Node.decode(first_pass)
      assert {:ok, second_pass} = Node.encode(decoded)
      assert first_pass == second_pass
    end

    test "explicit null for nil left is required for determinism" do
      # Two independent encodings of a node with left=nil must be byte-identical.
      assert {:ok, bytes_one} = Node.encode(Node.empty())
      assert {:ok, bytes_two} = Node.encode(Node.empty())
      assert bytes_one == bytes_two
    end
  end

  describe "cid/1" do
    test "returns a :drisl codec CID" do
      assert {:ok, cid} = Node.cid(Node.empty())
      assert cid.codec == :drisl
    end

    test "same node always produces the same CID" do
      empty = Node.empty()

      assert {:ok, first} = Node.cid(empty)
      assert {:ok, second} = Node.cid(empty)
      assert first == second
    end

    test "different nodes produce different CIDs" do
      populated = %Node{left: nil, entries: [entry(0, "col/key", @cid_a)]}

      assert {:ok, empty_cid} = Node.cid(Node.empty())
      assert {:ok, populated_cid} = Node.cid(populated)
      assert empty_cid != populated_cid
    end
  end

  describe "keys/1" do
    test "empty node returns empty list" do
      assert Node.keys(Node.empty()) == []
    end

    test "reconstructs full keys from prefix-compressed entries" do
      node = %Node{
        left: nil,
        entries: [
          entry(0, "foo/aaa", @cid_a),
          entry(4, "bbb", @cid_b),
          entry(4, "ccc", @cid_c)
        ]
      }

      assert Node.keys(node) == ["foo/aaa", "foo/bbb", "foo/ccc"]
    end

    test "first entry always has prefix_len 0" do
      node = %Node{left: nil, entries: [entry(0, "full/key", @cid_a)]}

      assert Node.keys(node) == ["full/key"]
    end
  end

  describe "compress_entries/1" do
    test "single entry has prefix_len 0" do
      assert [first | _] = Node.compress_entries([{"col/key", @cid_a, nil}])
      assert first.prefix_len == 0
      assert first.key_suffix == "col/key"
    end

    test "adjacent entries with common prefix are compressed" do
      # "app.bsky.feed.post/" = 19 shared bytes; the keys diverge at 'a' vs 'b'.
      triples = [
        {"app.bsky.feed.post/aaa", @cid_a, nil},
        {"app.bsky.feed.post/bbb", @cid_b, nil}
      ]

      assert [head_entry, tail_entry] = Node.compress_entries(triples)
      assert head_entry.prefix_len == 0
      assert head_entry.key_suffix == "app.bsky.feed.post/aaa"
      assert tail_entry.prefix_len == 19
      assert tail_entry.key_suffix == "bbb"
    end

    test "no shared prefix means prefix_len stays 0" do
      compressed = Node.compress_entries([{"aaa/x", @cid_a, nil}, {"zzz/y", @cid_b, nil}])

      assert [_first, second | _] = compressed
      assert second.prefix_len == 0
    end

    test "compress then expand is identity" do
      keys = ["col/aaa", "col/bbb", "col/ccc"]

      entries =
        keys
        |> Enum.map(&{&1, @cid_a, nil})
        |> Node.compress_entries()

      assert Node.keys(%Node{left: nil, entries: entries}) == keys
    end
  end

  describe "decode/1 error cases" do
    test "returns error for non-CBOR bytes" do
      assert {:error, :decode, _} = Node.decode(<<0xFF, 0xFF, 0xFF>>)
    end

    test "returns error for trailing bytes" do
      {:ok, bytes} = Node.encode(Node.empty())
      assert {:error, :decode, :trailing_bytes} = Node.decode(bytes <> <<0x00>>)
    end

    test "returns error for invalid structure (not a map)" do
      # A bare integer is valid DRISL but is not an MST node map.
      {:ok, not_a_map} = DASL.DRISL.encode(42)
      assert {:error, :decode, _} = Node.decode(not_a_map)
    end
  end

  # Builds an entry struct; `right` defaults to "no right subtree".
  defp entry(prefix_len, key_suffix, value, right \\ nil) do
    %Entry{prefix_len: prefix_len, key_suffix: key_suffix, value: value, right: right}
  end

  # Encodes `node`, decodes the resulting bytes and returns the decoded node,
  # failing the calling test if either step does not return `{:ok, _}`.
  defp round_trip(%Node{} = node) do
    assert {:ok, bytes} = Node.encode(node)
    assert {:ok, decoded} = Node.decode(bytes)
    decoded
  end
end
+88
test/mst/store/memory_test.exs
defmodule MST.Store.MemoryTest do
  use ExUnit.Case, async: true

  doctest MST.Store.Memory

  alias DASL.CID
  alias MST.{Node, Store}
  alias MST.Store.Memory

  # Shared fixture: the empty node and its CID, computed at compile time.
  #
  # The match is deliberately assertive. A `with` here would fall through on
  # failure and silently store the error tuple in `@cid`, turning a broken
  # `Node.cid/1` into confusing failures in every test below instead of one
  # clear compile-time MatchError.
  @node Node.empty()
  {:ok, empty_node_cid} = Node.cid(@node)
  @cid empty_node_cid

  describe "new/0" do
    test "returns a {module, state} pair" do
      assert {Memory, %{}} = Memory.new()
    end
  end

  describe "get/2" do
    test "returns :not_found for missing CID" do
      store = Memory.new()
      assert {:error, :not_found} = Store.get(store, @cid)
    end

    test "returns node after put" do
      store = Store.put(Memory.new(), @cid, @node)
      assert {:ok, @node} = Store.get(store, @cid)
    end
  end

  describe "put/3" do
    test "returns updated store" do
      updated = Store.put(Memory.new(), @cid, @node)
      assert Store.has?(updated, @cid)
    end

    test "original store is unaffected (immutable)" do
      original = Memory.new()
      _updated = Store.put(original, @cid, @node)
      refute Store.has?(original, @cid)
    end
  end

  describe "has?/2" do
    test "false for missing CID" do
      refute Store.has?(Memory.new(), @cid)
    end

    test "true after put" do
      store = Store.put(Memory.new(), @cid, @node)
      assert Store.has?(store, @cid)
    end
  end

  describe "cids/1" do
    test "empty store returns empty list" do
      assert Store.cids(Memory.new()) == []
    end

    test "returns all inserted CIDs" do
      # A second, non-empty node so the store holds two distinct CIDs.
      other_node = %Node{
        left: nil,
        entries: [
          %Node.Entry{
            prefix_len: 0,
            key_suffix: "x",
            value: CID.compute("v"),
            right: nil
          }
        ]
      }

      {:ok, other_cid} = Node.cid(other_node)

      store =
        Memory.new()
        |> Store.put(@cid, @node)
        |> Store.put(other_cid, other_node)

      cids = Store.cids(store)
      assert @cid in cids
      assert other_cid in cids
      assert length(cids) == 2
    end
  end
end
+358
test/mst/tree_test.exs
defmodule MST.TreeTest do
  use ExUnit.Case, async: true

  doctest MST.Tree

  alias DASL.CID
  alias MST.Tree

  # ---------------------------------------------------------------------------
  # Spec-compliance fixtures (evaluated once, at compile time)
  # ---------------------------------------------------------------------------

  # The 7 fixture keys with known heights: 0, 1, 0, 2, 0, 1, 0.
  @fixture_keys ["k/00", "k/02", "k/04", "k/39", "k/40", "k/48", "k/49"]

  # Value CID for each fixture key, derived from a small DRISL-encoded record.
  @fixture_values Map.new(@fixture_keys, fn key ->
                    term = %{"$type" => "mst-test-data", "value_for" => key}
                    {:ok, bytes} = DASL.DRISL.encode(term)
                    {key, DASL.CID.compute(bytes, :drisl)}
                  end)

  @cars_dir Path.join([__DIR__, "..", "fixtures", "mst-test-suite", "cars", "exhaustive"])

  # Root CID of each of the 128 fixture CARs, keyed by fixture index. Bit `j`
  # of the index selects whether `Enum.at(@fixture_keys, j)` is in that tree.
  @fixture_roots Map.new(0..127, fn i ->
                   name = "exhaustive_#{String.pad_leading(Integer.to_string(i), 3, "0")}.car"
                   {:ok, tree} = MST.CAR.from_binary(File.read!(Path.join(@cars_dir, name)))
                   {i, tree.root}
                 end)

  # Largest key set for which every insertion order is tried (4! = 24 orders).
  @max_permuted_keys 4

  describe "new/1" do
    test "creates an empty tree" do
      tree = new_tree()
      assert tree.root == nil
      assert {:ok, []} = Tree.to_list(tree)
    end
  end

  describe "put/3 and get/3" do
    test "insert and retrieve a single key" do
      v = val("data")
      assert {:ok, tree} = Tree.put(new_tree(), "col/key", v)
      assert {:ok, ^v} = Tree.get(tree, "col/key")
    end

    test "get returns :not_found for missing key" do
      assert {:error, :not_found} = Tree.get(new_tree(), "col/missing")
    end

    test "insert multiple keys and retrieve each" do
      pairs = for i <- 1..10, do: {"col/k#{String.pad_leading("#{i}", 3, "0")}", val("v#{i}")}
      tree = build_tree(pairs)

      for {k, v} <- pairs do
        assert {:ok, ^v} = Tree.get(tree, k)
      end
    end

    test "overwrite existing key updates value" do
      v1 = val("first")
      v2 = val("second")
      tree = build_tree([{"col/key", v1}, {"col/key", v2}])
      assert {:ok, ^v2} = Tree.get(tree, "col/key")
    end

    test "insert is immutable (old tree unaffected)" do
      v = val("data")
      tree0 = new_tree()
      {:ok, tree1} = Tree.put(tree0, "col/key", v)
      assert {:error, :not_found} = Tree.get(tree0, "col/key")
      assert {:ok, ^v} = Tree.get(tree1, "col/key")
    end

    test "keys at different heights coexist correctly" do
      # "blue" has height 1, "2653ae71" has height 0.
      v1 = val("v1")
      v2 = val("v2")
      tree = build_tree([{"blue", v1}, {"2653ae71", v2}])
      assert {:ok, ^v1} = Tree.get(tree, "blue")
      assert {:ok, ^v2} = Tree.get(tree, "2653ae71")
    end
  end

  describe "delete/2" do
    test "delete returns :not_found for missing key" do
      assert {:error, :not_found} = Tree.delete(new_tree(), "col/missing")
    end

    test "delete removes a key" do
      tree = build_tree([{"col/key", val("data")}])
      {:ok, tree} = Tree.delete(tree, "col/key")
      assert {:error, :not_found} = Tree.get(tree, "col/key")
    end

    test "delete last key empties the tree" do
      tree = build_tree([{"col/key", val("data")}])
      {:ok, tree} = Tree.delete(tree, "col/key")
      assert {:ok, []} = Tree.to_list(tree)
    end

    test "delete one of several keys" do
      v = val("v")
      tree = build_tree([{"col/a", v}, {"col/b", v}, {"col/c", v}])
      {:ok, tree} = Tree.delete(tree, "col/b")
      assert {:error, :not_found} = Tree.get(tree, "col/b")
      assert {:ok, ^v} = Tree.get(tree, "col/a")
      assert {:ok, ^v} = Tree.get(tree, "col/c")
    end

    test "delete is immutable (old tree unaffected)" do
      v = val("data")
      tree1 = build_tree([{"col/key", v}])
      {:ok, _tree2} = Tree.delete(tree1, "col/key")
      assert {:ok, ^v} = Tree.get(tree1, "col/key")
    end
  end

  describe "to_list/1" do
    test "empty tree returns empty list" do
      assert {:ok, []} = Tree.to_list(new_tree())
    end

    test "returns keys in sorted order" do
      v = val("v")
      tree = build_tree(for k <- ["col/z", "col/a", "col/m", "col/b"], do: {k, v})

      assert {:ok, pairs} = Tree.to_list(tree)
      keys = Enum.map(pairs, &elem(&1, 0))
      assert keys == Enum.sort(keys)
      assert keys == ["col/a", "col/b", "col/m", "col/z"]
    end

    test "values are correct for each key" do
      pairs = [{"col/a", val("va")}, {"col/b", val("vb")}, {"col/c", val("vc")}]

      assert {:ok, result} = pairs |> build_tree() |> Tree.to_list()
      assert result == Enum.sort_by(pairs, &elem(&1, 0))
    end
  end

  describe "stream/1" do
    test "empty tree streams nothing" do
      assert [] = new_tree() |> Tree.stream() |> Enum.to_list()
    end

    test "stream yields same pairs as to_list" do
      v = val("v")
      tree = build_tree(for k <- ["col/c", "col/a", "col/b"], do: {k, v})

      assert {:ok, list_pairs} = Tree.to_list(tree)
      assert tree |> Tree.stream() |> Enum.to_list() == list_pairs
    end
  end

  describe "length/1" do
    test "empty tree has length 0" do
      assert {:ok, 0} = Tree.length(new_tree())
    end

    test "size tracks insertions" do
      v = val("v")
      {:ok, t1} = Tree.put(new_tree(), "col/a", v)
      {:ok, t2} = Tree.put(t1, "col/b", v)
      assert {:ok, 1} = Tree.length(t1)
      assert {:ok, 2} = Tree.length(t2)
    end

    test "overwrite does not change size" do
      tree = build_tree([{"col/a", val("v")}, {"col/a", val("v2")}])
      assert {:ok, 1} = Tree.length(tree)
    end

    test "delete reduces size" do
      v = val("v")
      tree = build_tree([{"col/a", v}, {"col/b", v}])
      {:ok, tree} = Tree.delete(tree, "col/a")
      assert {:ok, 1} = Tree.length(tree)
    end
  end

  describe "collect_blocks/1" do
    test "empty tree returns empty map" do
      assert {:ok, %{}} = Tree.collect_blocks(new_tree())
    end

    test "non-empty tree returns at least one block" do
      tree = build_tree([{"col/a", val("v")}])
      assert {:ok, blocks} = Tree.collect_blocks(tree)
      assert map_size(blocks) >= 1
      assert Map.has_key?(blocks, tree.root)
    end

    test "all returned CIDs are :drisl codec" do
      tree = build_tree([{"col/a", val("v")}])
      assert {:ok, blocks} = Tree.collect_blocks(tree)

      for {cid, _bytes} <- blocks do
        assert cid.codec == :drisl
      end
    end
  end

  describe "determinism" do
    test "same keys/values in different insertion order produce the same root CID" do
      v = val("v")
      pairs = [{"col/a", v}, {"col/b", v}, {"col/c", v}]

      tree_forward = build_tree(pairs)
      tree_reverse = pairs |> Enum.reverse() |> build_tree()

      assert tree_forward.root == tree_reverse.root
    end

    test "delete then re-insert produces the same root CID as never deleting" do
      v = val("v")
      never_deleted = build_tree([{"col/a", v}, {"col/b", v}])

      # Same content, but "col/b" is removed and put back along the way.
      {:ok, without_b} = Tree.delete(build_tree([{"col/a", v}, {"col/b", v}]), "col/b")
      {:ok, reinserted} = Tree.put(without_b, "col/b", v)

      assert never_deleted.root == reinserted.root
    end
  end

  describe "spec compliance (fixture CID matching)" do
    for i <- 1..127 do
      @tag :slow
      test "MST #{i} built from scratch matches fixture root CID" do
        i = unquote(i)
        active_keys = active_keys(i)
        tree = build_fixture_tree(active_keys)

        assert tree.root == @fixture_roots[i],
               "MST #{i} (keys: #{inspect(active_keys)}) root CID mismatch"
      end
    end

    for i <- 1..127 do
      @tag :slow
      test "MST #{i} survives put-all then delete-all cycle" do
        active_keys = active_keys(unquote(i))

        empty =
          Enum.reduce(active_keys, build_fixture_tree(active_keys), fn key, acc ->
            {:ok, next} = Tree.delete(acc, key)
            next
          end)

        assert {:ok, []} = Tree.to_list(empty)
        assert empty.root == nil
      end
    end

    # Only key sets of up to @max_permuted_keys keys are permuted, to keep the
    # suite fast. The filter runs at compile time so that larger sets generate
    # no test at all, rather than a test that passes without asserting anything.
    # `0..6` enumerates the bit positions of the 7 fixture keys.
    for i <- 1..127,
        Enum.count(0..6, &(Bitwise.band(i, Bitwise.bsl(1, &1)) != 0)) <= @max_permuted_keys do
      @tag :slow
      test "MST #{i} inserted in every permutation produces the fixture root CID" do
        i = unquote(i)

        roots =
          i
          |> active_keys()
          |> permutations()
          |> Enum.map(&build_fixture_tree(&1).root)
          |> Enum.uniq()

        assert roots == [@fixture_roots[i]],
               "MST #{i}: different insertion orders produce different roots"
      end
    end
  end

  # ---------------------------------------------------------------------------
  # Helpers
  # ---------------------------------------------------------------------------

  # Fresh empty tree backed by an in-memory store.
  defp new_tree, do: Tree.new(MST.Store.Memory.new())

  # Deterministic value CID derived from an arbitrary string.
  defp val(s), do: CID.compute(s, :raw)

  # Inserts `{key, value}` pairs, in the given order, into a fresh tree.
  # Crashes with a MatchError if any insertion fails.
  defp build_tree(pairs) do
    Enum.reduce(pairs, new_tree(), fn {key, value}, acc ->
      {:ok, next} = Tree.put(acc, key, value)
      next
    end)
  end

  # Builds a tree from fixture keys (in the given order) and their fixture values.
  defp build_fixture_tree(keys) do
    keys
    |> Enum.map(&{&1, @fixture_values[&1]})
    |> build_tree()
  end

  # Fixture keys selected by bitmask `mask`: bit `j` set => j-th key included.
  defp active_keys(mask) do
    for {key, bit} <- Enum.with_index(@fixture_keys),
        Bitwise.band(mask, Bitwise.bsl(1, bit)) != 0,
        do: key
  end

  # Generate all permutations of a list (elements are assumed distinct).
  defp permutations([]), do: [[]]

  defp permutations(list) do
    for elem <- list, rest <- permutations(list -- [elem]), do: [elem | rest]
  end
end
+2-5
test/mst_test.exs
defmodule MSTTest do
  use ExUnit.Case, async: true

  # The top-level API is covered by the `iex>` examples in the `MST` docs;
  # behavioural tests live in the per-module test files under test/mst/.
  doctest MST
end