this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

working mst enumeration via NodeWalker

+162 -31
+162 -31
mst.py
··· 6 6 from more_itertools import ilen 7 7 from itertools import takewhile 8 8 from dataclasses import dataclass 9 - from typing import Tuple, Self, Optional, Any, Type, Iterable 9 + from typing import Tuple, Self, Optional, Any, Dict, List, Type, Iterable 10 + from collections import namedtuple 10 11 11 12 from util import indent, hash_to_cid 12 13 from blockstore import BlockStore, MemoryBlockStore ··· 30 31 keys: (0, 1, 2, 3) 31 32 vals: (0, 1, 2, 3) 32 33 subtrees: (0, 1, 2, 3, 4) 34 + 35 + If a method is implemented in this class, it's because it's a function/property 36 + of a single node, as opposed to a whole tree 33 37 """ 34 38 keys: Tuple[str] # collection/rkey 35 39 vals: Tuple[CID] # record CIDs ··· 55 59 vals=() 56 60 ) 57 61 62 + # this should maybe not be implemented here? 58 63 @staticmethod 59 64 def key_height(key: str) -> int: 60 65 digest = int.from_bytes(hashlib.sha256(key.encode()).digest(), "big") ··· 97 102 keys = [] 98 103 vals = [] 99 104 prev_key = b"" 100 - for e in cbor["e"]: # TODO: make extra sure that these checks are watertight 105 + for e in cbor["e"]: # TODO: make extra sure that these checks are watertight wrt non-canonical representations 101 106 if len(e) != 4: # k, p, t, v 102 107 raise ValueError("malformed MST node") 103 108 prefix_len: int = e["p"] ··· 162 167 for loading and storing MSTNodes 163 168 """ 164 169 bs: BlockStore 170 + #cache: Dict[Optional[CID], MSTNode] 171 + #cache_counts: Dict[Optional[CID], int] 165 172 166 173 def __init__(self, bs: BlockStore) -> None: 167 174 self.bs = bs 175 + #self.cache = {} 176 + #self.cache_counts = {} 168 177 169 - # TODO: LRU cache this 178 + # TODO: LRU cache this - this package looks ideal: https://github.com/amitdev/lru-dict 170 179 def get(self, cid: Optional[CID]) -> MSTNode: 171 180 """ 172 181 if cid is None, returns an empty MST node ··· 181 190 self.bs.put(bytes(node.cid), node.serialised) 182 191 return node # this is convenient 183 192 193 + # MST pretty-printing 194 + # this should maybe not be implemented here 195 + def pretty(self, node_cid: Optional[CID]) -> str: 196 + if node_cid is None: 197 + return "<empty>" 198 + node = self.get(node_cid) 199 + res = f"MSTNode<cid={node.cid.encode("base32")}>(\n{indent(self.pretty(node.subtrees[0]))},\n" 200 + for k, v, t in zip(node.keys, node.vals, node.subtrees[1:]): 201 + res += f" {k!r} ({MSTNode.key_height(k)}) -> {v.encode("base32")},\n" 202 + res += indent(self.pretty(t)) + ",\n" 203 + res += ")" 204 + return res 205 + 184 206 185 207 class NodeWrangler: 186 208 """ 187 209 NodeWrangler is where core MST transformation ops are implemented, backed 188 210 by a NodeStore 211 + 212 + The external APIs take a CID (the MST root) and return a CID (the new root), 213 + while storing any newly created nodes in the NodeStore. 214 + 215 + Neither method should ever fail - deleting a node that doesn't exist is a nop, 216 + and adding the same node twice with the same value is also a nop. Callers 217 + can detect these cases by seeing if the initial and final CIDs changed. 189 218 """ 190 219 ns: NodeStore 191 220 ··· 330 359 ), 331 360 ) + right.subtrees[1:] 332 361 ))._to_optional() 362 + 363 + 364 + class NodeWalker: 365 + """ 366 + NodeWalker makes implementing tree diffing and other MST query ops more 367 + convenient (but it does not, itself, implement them). 368 + 369 + A NodeWalker starts off at the root of a tree, and can walk along or recurse 370 + down into subtrees. 371 + 372 + Walking "off the end" of a subtree brings you back up to its parent. 373 + 374 + At any point in time, the current node is given by node_stack[-1], and its current position 375 + within that node is given by idx_stack[-1], which corresponds to a subtree index. 376 + 377 + Recall MSTNode layout: 378 + 379 + keys: (lkey) (0, 1, 2, 3) (rkey) 380 + vals: (0, 1, 2, 3) 381 + subtrees: (0, 1, 2, 3, 4) 382 + """ 383 + ns: NodeStore 384 + 385 + @dataclass 386 + class StackFrame: 387 + node: MSTNode # could store CIDs only to save memory, in theory, but not much point 388 + lkey: str 389 + rkey: str 390 + idx: int 391 + 392 + KEY_MIN = "" # string that compares less than all legal key strings 393 + KEY_MAX = "\xff" # string that compares greater than all legal key strings 394 + 395 + @dataclass 396 + class State: 397 + lkey: str 398 + lval: Optional[CID] 399 + subtree: Optional[CID] 400 + rkey: str 401 + rval: Optional[CID] 402 + 403 + stack: List[StackFrame] 333 404 334 - def pretty(self, node_cid: Optional[CID]) -> str: 335 - if node_cid is None: 336 - return "<empty>" 337 - node = self.ns.get(node_cid) 338 - res = f"MSTNode<cid={node.cid.encode("base32")}>(\n{indent(self.pretty(node.subtrees[0]))},\n" 339 - for k, v, t in zip(node.keys, node.vals, node.subtrees[1:]): 340 - res += f" {k!r} ({MSTNode.key_height(k)}) -> {v.encode("base32")},\n" 341 - res += indent(self.pretty(t)) + ",\n" 342 - res += ")" 343 - return res 405 + def __init__(self, ns: NodeStore, root_cid: CID) -> None: 406 + self.ns = ns 407 + self.stack = [self.StackFrame( 408 + node=self.ns.get(root_cid), 409 + lkey=self.KEY_MIN, 410 + rkey=self.KEY_MAX, 411 + idx=0 412 + )] 413 + 414 + @property 415 + def frame(self) -> StackFrame: 416 + return self.stack[-1] 417 + 418 + @property 419 + def lkey(self) -> str: 420 + return self.frame.lkey if self.frame.idx == 0 else self.frame.node.keys[self.frame.idx - 1] 421 + 422 + @property 423 + def lval(self) -> Optional[CID]: 424 + return None if self.frame.idx == 0 else self.frame.node.vals[self.frame.idx - 1] 425 + 426 + @property 427 + def subtree(self) -> Optional[CID]: 428 + return self.frame.node.subtrees[self.frame.idx] 429 + 430 + @property 431 + def rkey(self) -> str: 432 + return self.frame.rkey if self.frame.idx == len(self.frame.node.keys) else self.frame.node.keys[self.frame.idx] 433 + 434 + @property 435 + def rval(self) -> Optional[CID]: 436 + return None if self.frame.idx == len(self.frame.node.vals) else self.frame.node.vals[self.frame.idx] 437 + 438 + @property 439 + def final(self) -> bool: 440 + return self.subtree is None and self.rkey == NodeWalker.KEY_MAX 441 + 442 + def right(self) -> None: 443 + if (self.frame.idx + 1) >= len(self.frame.node.subtrees): 444 + # we reached the end of this node, go up a level 445 + self.stack.pop() 446 + if not self.stack: 447 + raise StopIteration # you probably want to check .final instead of hitting this 448 + return self.right() # we need to recurse, to skip over empty intermediates on the way back up 449 + self.frame.idx += 1 450 + 451 + def down(self) -> None: 452 + subtree = self.frame.node.subtrees[self.frame.idx] 453 + if subtree is None: 454 + raise Exception("oi, you can't recurse here mate") 455 + 456 + self.stack.append(self.StackFrame( 457 + node=self.ns.get(subtree), 458 + lkey=self.lkey, 459 + rkey=self.rkey, 460 + idx=0 461 + )) 462 + 463 + def enumerate_mst(ns: NodeStore, root_cid: CID): 464 + cur = NodeWalker(ns, root_cid) 465 + while not cur.final: 466 + while cur.subtree: # recurse down every subtree 467 + cur.down() 468 + cur.right() 469 + print(cur.lkey, "->", cur.lval.encode("base32")) # print the kv pair we just jumped over 344 470 345 471 346 472 if __name__ == "__main__": 347 - if 0: 473 + if 1: 348 474 from carfile import ReadOnlyCARBlockStore 349 475 f = open("/home/david/programming/python/bskyclient/retr0id.car", "rb") 350 476 bs = ReadOnlyCARBlockStore(f) 351 477 commit_obj = dag_cbor.decode(bs.get(bytes(bs.car_roots[0]))) 352 478 mst_root: CID = commit_obj["data"] 353 479 ns = NodeStore(bs) 354 - mst = NodeWrangler(ns, mst_root) 355 - print(mst) 480 + #wrangler = NodeWrangler(ns) 481 + #print(wrangler) 482 + enumerate_mst(ns, mst_root) 356 483 else: 357 484 bs = MemoryBlockStore() 358 485 ns = NodeStore(bs) 359 - mst = NodeWrangler(ns) 486 + wrangler = NodeWrangler(ns) 360 487 root = ns.get(None).cid 361 - print(mst.pretty(root)) 362 - root = mst.put(root, "hello", hash_to_cid(b"blah")) 363 - print(mst.pretty(root)) 364 - root = mst.put(root, "foo", hash_to_cid(b"bar")) 365 - print(mst.pretty(root)) 366 - root = mst.put(root, "bar", hash_to_cid(b"bat")) 367 - print(mst.pretty(root)) 368 - root = mst.delete(root, "foo") 369 - root = mst.delete(root, "hello") 370 - print(mst.pretty(root)) 371 - root = mst.delete(root, "bar") 372 - print(mst.pretty(root)) 373 - root = mst.delete(root, "bar") 374 - print(mst.pretty(root)) 488 + print(ns.pretty(root)) 489 + root = wrangler.put(root, "hello", hash_to_cid(b"blah")) 490 + print(ns.pretty(root)) 491 + root = wrangler.put(root, "foo", hash_to_cid(b"bar")) 492 + print(ns.pretty(root)) 493 + root = wrangler.put(root, "bar", hash_to_cid(b"bat")) 494 + root = wrangler.put(root, "xyzz", hash_to_cid(b"bat")) 495 + print(ns.pretty(root)) 496 + #exit() 497 + enumerate_mst(ns, root) 498 + exit() 499 + root = wrangler.delete(root, "foo") 500 + root = wrangler.delete(root, "hello") 501 + print(ns.pretty(root)) 502 + root = wrangler.delete(root, "bar") 503 + print(ns.pretty(root)) 504 + root = wrangler.delete(root, "bar") 505 + print(ns.pretty(root))