···141141Verification asserts the integrity of the repository contents; verifying the signature of the archive's [commit object][commit] (if present) is a separate process, outside the scope of STAR. See atproto [commit signatures][commit-sigs].
142142143143144144+#### Pseudo-code
145145+144146```python
145147# MstNode interface:
146148# is_empty() => bool true if the node has no subtree or value links
···173175 for node, parent in zip(stack[:key_layer], stack[1:]):
174176 if node.is_empty():
175177 continue # skip possible empty bottom-most nodes
176176- parent.link_subtree(compute_cid(node.to_cbor()))
178178+ cid = compute_cid(node.to_cbor())
179179+ parent.link_subtree(cid)
177180 node.reset_to_empty()
178181179182 # add a node entry for the current record
180180- stack[key_layer].link_record(key, compute_cid(record_cbor))
183183+ record_cid = compute_cid(record_cbor)
184184+ stack[key_layer].link_record(key, record_cid)
181185182186 prev_layer = key_layer
183187···185189 for node, parent in zip(stack[:-1], stack[1:]):
186190 if node.is_empty():
187191 continue
188188- parent.link_subtree(compute_cid(node.to_cbor()))
192192+ cid = compute_cid(node.to_cbor())
193193+ parent.link_subtree(cid)
189194 node.reset_to_empty()
190195191196 # get the finished root node, finally.
···194199 else:
195200 root = MstNode() # empty repo: atproto CAR writes one single empty node
196201197197- return compute_cid(root.to_cbor())
202202+ root_cid = compute_cid(root.to_cbor())
203203+ return root_cid
198204```
199205200206···204210205211Since our depth-first walk finalizes children before parents, and the final parent finalizes last, we must unfortunately buffer all serialized CAR frames while the tree is walked. The good news is that a disk-spill-friendly byte log works well for this buffering.
206212207207-#### pseudo-code
213213+#### Pseudo-code
208214209215```python
210216# MstNode interface changes:
211217# entries list of (key, cid, frame position, right link)
212218# left, entries[].right optional subtree link + stashed emit plan
213213-# link_record(key, cid, frame_pos) stash the carv1 frame's byte log position
219219+# link_record(key, cid, frame_position) stash the carv1 frame's byte_log spot
214220# link_subtree(cid, subtree_emit_plan) stash an emit plan with the link
215221216222def car_frame(data_bytes: bytes) -> tuple[Cid, bytes]:
···263269 key_record_pairs must be in lexicographic key order (= depth-first mst walk)
264270 """
265271 stack: list[MstNode] = []
266266- byte_log = bytearray()
272272+ byte_log = bytearray() # append-only storage of CARv1 frames
267273 prev_layer = -1
268274269275 # the actual walk. everything to the left of the stack is finalized.
···284290 continue # skip possible empty bottom-most nodes
285291286292 # put finalized (+serialized, CAR-framed) node into the byte log
287287- frame_position = len(byte_log)
288293 cid, framed = car_frame(node.to_cbor())
294294+ frame_position = len(byte_log)
289295 byte_log.extend(framed)
290296291297 # link it from the parent node now it's finalized with a CID
···294300 node.reset_to_empty()
295301296302 # put the current record into the byte log
297297- frame_position = len(byte_log)
298303 record_cid, framed = car_frame(record_cbor)
304304+ frame_position = len(byte_log)
299305 byte_log.extend(framed)
300306301307 # and link it from the MST node's entries at this layer
···308314 if node.is_empty():
309315 continue
310316311311- frame_position = len(byte_log)
312317 cid, framed = car_frame(node.to_cbor())
318318+ frame_position = len(byte_log)
313319 byte_log.extend(framed)
314320315321 node_emit_plan = build_subtree_emit_plan(node, frame_position)
···323329 root = MstNode() # empty repo: atproto CAR writes one single empty node
324330325331 # frame the root and get it in the log!
326326- root_frame_position = len(byte_log)
327332 root_cid, framed = car_frame(root.to_cbor())
333333+ root_frame_position = len(byte_log)
328334 byte_log.extend(framed)
329335330336 # and pull together the final emit plan
···336342 output.extend(frame_at(byte_log, position))
337343338344 return root_cid, output
345345+```
346346+347347+348348+#### Structural similarity
349349+350350+To emphasize the core of the MST-reconstructing algorithm, here is a diff between the main routine for verification and the main routine for conversion to stream-ordered CARv1.
351351+352352+```diff,python
353353+-def reconstruct_root_cid(key_record_pairs):
354354++def to_stream_ordered_car_body(key_record_pairs):
355355+- """Compute the MST root CID from repo contents
356356++ """Get a stream-ordered atproto CAR body from repository contents
357357+358358+ key_record_pairs must be in lexicographic key order (= depth-first mst walk)
359359+ """
360360+ stack: list[MstNode] = []
361361++ byte_log = bytearray() # append-only storage of CARv1 frames
362362+ prev_layer = -1
363363+364364+ # the actual walk. everything to the left of the stack is finalized.
365365+ # anything remaining in the stack gets rolled up at the end.
366366+ for (key, record_cbor) in key_record_pairs:
367367+ key_layer = compute_mst_layer(key)
368368+369369+ # grow the stack if needed, init with empty nodes.
370370+ while len(stack) <= key_layer:
371371+ stack.append(MstNode())
372372+373373+ # finalize lower levels if this key is at a higher level than last.
374374+ # higher key means everything lower in the stack is left-of-us now.
375375+ if key_layer > prev_layer:
376376+ for node, parent in zip(stack[:key_layer], stack[1:]):
377377+ if node.is_empty():
378378+ continue # skip possible empty bottom-most nodes
379379++
380380++ # put finalized (+serialized, CAR-framed) node into the byte log
381381+- cid = compute_cid(node.to_cbor())
382382++ cid, framed = car_frame(node.to_cbor())
383383++ frame_position = len(byte_log)
384384++ byte_log.extend(framed)
385385++
386386++ # link it from the parent node now it's finalized with a CID
387387++ node_emit_plan = build_subtree_emit_plan(node, frame_position)
388388+- parent.link_subtree(cid)
389389++ parent.link_subtree(cid, node_emit_plan)
390390+ node.reset_to_empty()
391391+392392+ # add a node entry for the current record
393393++ # and put it into the byte log
394394+- record_cid = compute_cid(record_cbor)
395395++ record_cid, framed = car_frame(record_cbor)
396396++ frame_position = len(byte_log)
397397++ byte_log.extend(framed)
398398++
399399+- stack[key_layer].link_record(key, record_cid)
400400++ stack[key_layer].link_record(key, record_cid, frame_position)
401401+402402+ prev_layer = key_layer
403403+404404+ # finalize remaining stack
405405+ for node, parent in zip(stack[:-1], stack[1:]):
406406+ if node.is_empty():
407407+ continue
408408++
409409+- cid = compute_cid(node.to_cbor())
410410++ cid, framed = car_frame(node.to_cbor())
411411++ frame_position = len(byte_log)
412412++ byte_log.extend(framed)
413413++
414414++ node_emit_plan = build_subtree_emit_plan(node, frame_position)
415415+- parent.link_subtree(cid)
416416++ parent.link_subtree(cid, node_emit_plan)
417417+ node.reset_to_empty()
418418+419419+ # get the finished root node, finally.
420420+ if len(stack) > 0:
421421+ root = stack[-1]
422422+ else:
423423+ root = MstNode() # empty repo: atproto CAR writes one single empty node
424424+425425++ # frame the root and get it in the log!
426426+- root_cid = compute_cid(root.to_cbor())
427427++ root_cid, framed = car_frame(root.to_cbor())
428428++ root_frame_position = len(byte_log)
429429++ byte_log.extend(framed)
430430++
431431++ # and pull together the final emit plan
432432++ root_emit_plan = build_subtree_emit_plan(root, root_frame_position)
433433++
434434++ # walk the plan into the final output!!!
435435++ output = bytearray()
436436++ for position in root_emit_plan:
437437++ output.extend(frame_at(byte_log, position))
438438++
439439+- return root_cid
440440++ return root_cid, output
339441```
340442341443