this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

barely-working MST writes

+620
+162
blockstore.py
··· 1 + from abc import ABC, abstractmethod 2 + from typing import Self, Optional, Dict, BinaryIO 3 + import sqlite3 4 + 5 + 6 + class BlockStore(ABC): 7 + """ 8 + A block store is a k/v store where values are immutable once set. They can be deleted, though. 9 + In practice, k==hash(v), but this API doesn't care about that. 10 + 11 + I'm not using the "native" __getitem__, __setitem__, __del__ methods because 12 + the semantics of these methods differ subtly. 13 + 14 + if you call put() twice with the same args, the second call is a nop. 15 + if you call put() twice with the same key but different value, you get a ValueError 16 + 17 + get() offers no default return value, you get a KeyError if it doesn't exist. 18 + 19 + if you try to delete a key that doesn't exist, that's a nop. 20 + """ 21 + 22 + @abstractmethod 23 + def put(self, key: bytes, value: bytes) -> None: 24 + pass 25 + 26 + @abstractmethod 27 + def get(self, key: bytes) -> bytes: 28 + pass 29 + 30 + @abstractmethod 31 + def delete(self, key: bytes) -> None: 32 + pass 33 + 34 + 35 + class MemoryBlockStore(BlockStore): 36 + _state = Dict[bytes, bytes] 37 + 38 + def __init__(self, state: Optional[Dict[bytes, bytes]]=None) -> None: 39 + """ 40 + NB: if a state dict is passed, it'll get mutated in-place 41 + """ 42 + self._state = dict() if state is None else state 43 + 44 + def put(self, key: bytes, value: bytes) -> None: 45 + existing_value = self._state.get(key) 46 + if existing_value: 47 + if existing_value == value: 48 + return # the value matches, there's nothing to do 49 + raise ValueError("block values are immutable") 50 + self._state[key] = value 51 + 52 + def get(self, key: bytes) -> bytes: 53 + value = self._state.get(key) 54 + if value is None: 55 + raise KeyError("no block matches this key") 56 + return value 57 + 58 + def delete(self, key: bytes) -> None: 59 + if key in self._state: 60 + del self._state[key] 61 + 62 + 63 + class SqliteBlockStore(BlockStore): 64 + """ 65 + NB: Caller is responsible for calling commit(), etc. 66 + TODO: consider allowing a custom table name? 67 + """ 68 + def __init__(self, con: sqlite3.Connection, table: str="mst_blocks") -> None: 69 + self.table = table 70 + self._cur = con.cursor() 71 + self._cur.execute(f""" 72 + CREATE TABLE IF NOT EXISTS {self.table} ( 73 + block_key BLOB PRIMARY KEY, 74 + block_val BLOB NOT NULL 75 + ) WITHOUT ROWID; 76 + """) 77 + 78 + def put(self, key: bytes, value: bytes) -> None: 79 + # XXX: this will fail silently if the key already exists but with a different value 80 + # (that should never happen but it'd be nice to have guard rails) 81 + self._cur.execute(f"INSERT OR IGNORE INTO {self.table} (block_key, block_val) VALUES (?, ?)", (key, value)) 82 + 83 + def get(self, key: bytes) -> bytes: 84 + row = self._cur.execute(f"SELECT block_val FROM {self.table} WHERE block_key=?", (key,)).fetchone() 85 + if row is None: 86 + raise KeyError("no block matches this key") 87 + return row[0] 88 + 89 + def delete(self, key: bytes) -> None: 90 + self._cur.execute(f"DELETE FROM {self.table} WHERE block_key=?", (key,)) 91 + 92 + 93 + class OverlayBlockStore(BlockStore): 94 + """ 95 + reads come from "upper", then "lower" if they don't exist in upper. 96 + writes/deletes go only to "upper". 97 + """ 98 + 99 + def __init__(self, upper: BlockStore, lower: BlockStore) -> None: 100 + self.upper = upper 101 + self.lower = lower 102 + 103 + def put(self, key: bytes, value: bytes) -> None: 104 + self.upper.put(key, value) 105 + 106 + def get(self, key: bytes) -> bytes: 107 + try: 108 + return self.upper.get(key) 109 + except KeyError: 110 + return self.lower.get(key) 111 + 112 + def delete(self, key: bytes) -> None: 113 + self.upper.delete(key) 114 + 115 + 116 + 117 + if __name__ == "__main__": 118 + import os 119 + 120 + bs = MemoryBlockStore() 121 + bs.put(b"hello", b"world") 122 + 123 + bs.put(b"hello", b"world") # putting twice is a nop 124 + 125 + try: 126 + bs.put(b"hello", b"foobar") 127 + assert(False) # should be unreachable 128 + except ValueError: 129 + pass 130 + 131 + print("hello ->", bs.get(b"hello")) 132 + 133 + bs.delete(b"nothing") # nop 134 + 135 + bs.delete(b"hello") 136 + 137 + try: 138 + bs.get(b"hello") 139 + assert(False) # should be unreachable 140 + except KeyError: 141 + pass 142 + 143 + TEST_DB = "test.db" 144 + 145 + with sqlite3.connect(TEST_DB) as db: 146 + bs = SqliteBlockStore(db) 147 + bs.put(b"hello", b"sqlite world") 148 + 149 + with sqlite3.connect(TEST_DB) as db: 150 + bs = SqliteBlockStore(db) 151 + print("hello ->", bs.get(b"hello")) 152 + bs.delete(b"hello") 153 + 154 + try: 155 + with sqlite3.connect(TEST_DB) as db: 156 + bs = SqliteBlockStore(db) 157 + print("hello ->", bs.get(b"hello")) 158 + assert(False) # should be unreachable 159 + except KeyError: 160 + pass 161 + 162 + os.remove(TEST_DB) # clean up
+74
carfile.py
··· 1 + from typing import Self, Optional, Dict, List, Tuple, BinaryIO 2 + from multiformats import varint, CID 3 + import dag_cbor 4 + 5 + from blockstore import BlockStore 6 + 7 + class ReadOnlyCARBlockStore(BlockStore): 8 + """ 9 + This is a sliiiightly unclean abstraction because BlockStores are indexed 10 + by `bytes` rather than CID, but same idea. This is convenient for verifying 11 + proofs provided in CAR format, and for testing. 12 + """ 13 + 14 + car_roots: List[CID] 15 + block_offsets: Dict[bytes, Tuple[int, int]] # CID -> (offset, length) 16 + 17 + def __init__(self, file: BinaryIO) -> None: 18 + """ 19 + pre-scan over the whole file, recording the offsets of each block 20 + """ 21 + 22 + self.file = file 23 + file.seek(0) 24 + 25 + # parse out CAR header 26 + header_len = varint.decode(file) 27 + header = file.read(header_len) 28 + if len(header) != header_len: 29 + raise EOFError("not enough CAR header bytes") 30 + header_obj = dag_cbor.decode(header) 31 + if header_obj.get("version") != 1: 32 + raise ValueError(f"unsupported CAR version ({header_obj.get('version')})") 33 + self.car_roots = header_obj["roots"] 34 + 35 + # scan through the CAR to find block offsets 36 + self.block_offsets = {} 37 + while True: 38 + try: 39 + length = varint.decode(file) 40 + except ValueError: 41 + break # EOF 42 + start = file.tell() 43 + CID_LENGTH = 36 # XXX: this is a questionable assumption!!! 44 + cid = file.read(CID_LENGTH) 45 + if cid[:4] != b"\x01\x71\x12\x20": # I think this is enough to verify the assumption 46 + raise ValueError("unsupported CID type") 47 + self.block_offsets[cid] = (start + CID_LENGTH, length - CID_LENGTH) 48 + file.seek(start + length) 49 + 50 + def put(self, key: bytes, value: bytes) -> None: 51 + raise NotImplementedError("ReadOnlyCARBlockStore does not support put()") 52 + 53 + def get(self, key: bytes) -> bytes: 54 + offset, length = self.block_offsets[key] 55 + self.file.seek(offset) 56 + value = self.file.read(length) 57 + if len(value) != length: 58 + raise EOFError() 59 + return value 60 + 61 + def delete(self, key: bytes) -> None: 62 + raise NotImplementedError("ReadOnlyCARBlockStore does not support delete()") 63 + 64 + 65 + if __name__ == "__main__": 66 + f = open("/home/david/programming/python/bskyclient/retr0id.car", "rb") 67 + bs = ReadOnlyCARBlockStore(f) 68 + commit_obj = dag_cbor.decode(bs.get(bytes(bs.car_roots[0]))) 69 + print(commit_obj) 70 + mst_root: CID = commit_obj["data"] 71 + 72 + from mst import NodeStore 73 + ns = NodeStore(bs) 74 + print(ns.get(mst_root))
+375
mst.py
··· 1 + import hashlib 2 + import dag_cbor 3 + import operator 4 + from multiformats import multihash, CID 5 + from functools import cached_property 6 + from more_itertools import ilen 7 + from itertools import takewhile 8 + from dataclasses import dataclass 9 + from typing import Tuple, Self, Optional, Any, Type, Iterable 10 + 11 + from util import indent, hash_to_cid 12 + from blockstore import BlockStore, MemoryBlockStore 13 + 14 + # tuple helpers 15 + def tuple_replace_at(original: tuple, i: int, value: Any) -> tuple: 16 + return original[:i] + (value,) + original[i + 1:] 17 + 18 + def tuple_insert_at(original: tuple, i: int, value: Any) -> tuple: 19 + return original[:i] + (value,) + original[i:] 20 + 21 + def tuple_remove_at(original: tuple, i: int) -> tuple: 22 + return original[:i] + original[i + 1:] 23 + 24 + 25 + @dataclass(frozen=True) # frozen == immutable == win 26 + class MSTNode: 27 + """ 28 + k/v pairs are interleaved between subtrees like so: 29 + 30 + keys: (0, 1, 2, 3) 31 + vals: (0, 1, 2, 3) 32 + subtrees: (0, 1, 2, 3, 4) 33 + """ 34 + keys: Tuple[str] # collection/rkey 35 + vals: Tuple[CID] # record CIDs 36 + subtrees: Tuple[Optional[CID]] # a None value represents an empty subtree 37 + 38 + 39 + # NB: __init__ is auto-generated by dataclass decorator 40 + 41 + # these checks should never fail, and could be skipped for performance 42 + def __post_init__(self) -> None: 43 + # TODO: maybe check that they're tuples here? 44 + # implicitly, the length of self.subtrees must be at least 1 45 + if len(self.subtrees) != len(self.keys) + 1: 46 + raise ValueError("Invalid subtree count") 47 + if len(self.keys) != len(self.vals): 48 + raise ValueError("Mismatched keys/vals lengths") 49 + 50 + @classmethod 51 + def empty_root(cls) -> Self: 52 + return cls( 53 + subtrees=(None,), 54 + keys=(), 55 + vals=() 56 + ) 57 + 58 + @staticmethod 59 + def key_height(key: str) -> int: 60 + digest = int.from_bytes(hashlib.sha256(key.encode()).digest(), "big") 61 + leading_zeroes = 256 - digest.bit_length() 62 + return leading_zeroes // 2 63 + 64 + # since we're immutable, this can be cached 65 + @cached_property 66 + def cid(self) -> CID: 67 + digest = multihash.digest(self.serialised, "sha2-256") 68 + cid = CID("base32", 1, "dag-cbor", digest) 69 + return cid 70 + 71 + # likewise 72 + @cached_property 73 + def serialised(self) -> bytes: 74 + e = [] 75 + prev_key = b"" 76 + for subtree, key_str, value in zip(self.subtrees[1:], self.keys, self.vals): 77 + key_bytes = key_str.encode() 78 + shared_prefix_len = ilen(takewhile(bool, map(operator.eq, prev_key, key_bytes))) # I love functional programming 79 + e.append({ 80 + "k": key_bytes[shared_prefix_len:], 81 + "p": shared_prefix_len, 82 + "t": subtree, 83 + "v": value, 84 + }) 85 + prev_key = key_bytes 86 + return dag_cbor.encode({ 87 + "e": e, 88 + "l": self.subtrees[0] 89 + }) 90 + 91 + @classmethod 92 + def deserialise(cls, data: bytes) -> Self: 93 + cbor = dag_cbor.decode(data) 94 + if len(cbor) != 2: # e, l 95 + raise ValueError("malformed MST node") 96 + subtrees = [cbor["l"]] 97 + keys = [] 98 + vals = [] 99 + prev_key = b"" 100 + for e in cbor["e"]: # TODO: make extra sure that these checks are watertight 101 + if len(e) != 4: # k, p, t, v 102 + raise ValueError("malformed MST node") 103 + prefix_len: int = e["p"] 104 + suffix: bytes = e["k"] 105 + if prefix_len > len(prev_key): 106 + raise ValueError("invalid MST key prefix len") 107 + if prev_key[prefix_len:prefix_len+1] == suffix[:1]: 108 + raise ValueError("non-optimal MST key prefix len") 109 + this_key = prev_key[:prefix_len] + suffix 110 + if this_key <= prev_key: 111 + raise ValueError("invalid MST key sort order") 112 + keys.append(this_key.decode()) 113 + vals.append(e["v"]) 114 + subtrees.append(e["t"]) 115 + prev_key = this_key 116 + 117 + return cls( 118 + subtrees=tuple(subtrees), 119 + keys=tuple(keys), 120 + vals=tuple(vals) 121 + ) 122 + 123 + def is_empty(self) -> bool: 124 + return self.subtrees == (None,) 125 + 126 + def _to_optional(self) -> Optional[CID]: 127 + """ 128 + returns None if the node is empty 129 + """ 130 + if self.is_empty(): 131 + return None 132 + return self.cid 133 + 134 + 135 + @cached_property 136 + def height(self) -> int: 137 + # if there are keys at this level, query one directly 138 + if self.keys: 139 + return self.key_height(self.keys[0]) 140 + 141 + # we're an empty tree 142 + if self.subtrees[0] is None: 143 + return 0 144 + 145 + # this should only happen for non-root nodes with no keys 146 + raise Exception("cannot determine node height") 147 + 148 + def gte_index(self, key: str) -> int: 149 + """ 150 + find the index of the first key greater than or equal to the specified key 151 + if all keys are smaller, it returns len(keys) 152 + """ 153 + i = 0 # this loop could be a binary search but not worth it for small fanouts 154 + while i < len(self.keys) and key > self.keys[i]: 155 + i += 1 156 + return i 157 + 158 + 159 + class NodeStore: 160 + """ 161 + NodeStore wraps a BlockStore to provide a more ergonomic interface 162 + for loading and storing MSTNodes 163 + """ 164 + bs: BlockStore 165 + 166 + def __init__(self, bs: BlockStore) -> None: 167 + self.bs = bs 168 + 169 + # TODO: LRU cache this 170 + def get(self, cid: Optional[CID]) -> MSTNode: 171 + """ 172 + if cid is None, returns an empty MST node 173 + """ 174 + if cid is None: 175 + return MSTNode.empty_root() 176 + 177 + return MSTNode.deserialise(self.bs.get(bytes(cid))) 178 + 179 + # TODO: also put in cache 180 + def put(self, node: MSTNode) -> MSTNode: 181 + self.bs.put(bytes(node.cid), node.serialised) 182 + return node # this is convenient 183 + 184 + 185 + 186 + class MST: 187 + ns: NodeStore 188 + root: CID 189 + 190 + def __init__(self, ns: NodeStore, root: Optional[CID]=None) -> None: 191 + self.ns = ns 192 + if root is None: 193 + root = ns.put(MSTNode.empty_root()).cid 194 + self.root = root 195 + 196 + def put(self, key: str, val: CID): 197 + self.root = self._put(key, val) 198 + 199 + def delete(self, key: str): 200 + self.root = self._delete(key) 201 + 202 + def _put(self, key: str, val: CID) -> CID: 203 + root = ns.get(self.root) 204 + if root.is_empty(): # special case for empty tree 205 + return self._put_here(root, key, val) 206 + return self._put_recursive(root, key, val, MSTNode.key_height(key), root.height) 207 + 208 + def _put_here(self, node: MSTNode, key: str, val: CID) -> CID: 209 + i = node.gte_index(key) 210 + 211 + # the key is already present! 212 + if i < len(node.keys) and node.keys[i] == key: 213 + if node.vals[i] == val: 214 + return node.cid # we can return our old self if there is no change 215 + return self.ns.put(MSTNode( 216 + keys=node.keys, 217 + vals=tuple_replace_at(node.vals, i, val), 218 + subtrees=node.subtrees 219 + )).cid 220 + 221 + return self.ns.put(MSTNode( 222 + keys=tuple_insert_at(node.keys, i, key), 223 + vals=tuple_insert_at(node.vals, i, val), 224 + subtrees = node.subtrees[:i] + \ 225 + self._split_on_key(node.subtrees[i], key) + \ 226 + node.subtrees[i + 1:], 227 + )).cid 228 + 229 + def _put_recursive(self, node: MSTNode, key: str, val: CID, key_height: int, tree_height: int) -> CID: 230 + if key_height > tree_height: # we need to grow the tree 231 + return self.ns.put(self._put_recursive( 232 + MSTNode.empty_root(), 233 + key, val, key_height, tree_height + 1 234 + )).cid 235 + 236 + if key_height < tree_height: # we need to look below 237 + i = node.gte_index(key) 238 + return self.ns.put(MSTNode( 239 + keys=node.keys, 240 + vals=node.vals, 241 + subtrees=tuple_replace_at( 242 + node.subtrees, i, 243 + self._put_recursive( 244 + self.ns.get(node.subtrees[i]), 245 + key, val, key_height, tree_height - 1 246 + ) 247 + ) 248 + )).cid 249 + 250 + # we can insert here 251 + assert(key_height == tree_height) 252 + return self._put_here(node, key, val) 253 + 254 + def _split_on_key(self, node_cid: Optional[CID], key: str) -> Tuple[Optional[CID], Optional[CID]]: 255 + if node_cid is None: 256 + return None, None 257 + node = ns.get(node_cid) 258 + i = node.gte_index(key) 259 + lsub, rsub = self._split_on_key(node.subtrees[i], key) 260 + return self.ns.put(MSTNode( 261 + keys=node.keys[:i], 262 + vals=node.vals[:i], 263 + subtrees=node.subtrees[:i] + (lsub,) 264 + ))._to_optional(), self.ns.put(MSTNode( 265 + keys=node.keys[i:], 266 + vals=node.vals[i:], 267 + subtrees=(rsub,) + node.subtrees[i + 1:], 268 + ))._to_optional() 269 + 270 + def _squash_top(self, node_cid: Optional[CID]) -> Optional[CID]: 271 + """ 272 + strip empty nodes from the top of the tree 273 + """ 274 + node = self.ns.get(node_cid) 275 + if node.keys: 276 + return node_cid 277 + if node.subtrees[0] is None: 278 + return node_cid 279 + return self._squash_top(node.subtrees[0]) 280 + 281 + def _delete(self, key: str) -> CID: 282 + root = ns.get(self.root) 283 + # XXX: handle empty tree result case 284 + return self._squash_top(self._delete_recursive(root, key, MSTNode.key_height(key), root.height)) 285 + 286 + 287 + def _delete_recursive(self, node: MSTNode, key: str, key_height: int, tree_height: int) -> Optional[CID]: 288 + if key_height > tree_height: # the key cannot possibly be in this tree, no change needed 289 + return node.cid 290 + 291 + i = node.gte_index(key) 292 + if key_height < tree_height: # the key must be deleted from a subtree 293 + if node.subtrees[i] is None: 294 + return node.cid # the key cannot be in this subtree, no change needed 295 + return self.ns.put(MSTNode( 296 + keys=node.keys, 297 + vals=node.vals, 298 + subtrees=tuple_replace_at( 299 + node.subtrees, 300 + i, 301 + self._delete_recursive(self.ns.get(node.subtrees[i]), key, key_height, tree_height - 1) 302 + ) 303 + ))._to_optional() 304 + 305 + i = node.gte_index(key) 306 + if i == len(node.keys) or node.keys[i] != key: 307 + return node.cid # key already not present 308 + 309 + assert(node.keys[i] == key) # sanity check (should always be true) 310 + 311 + return self.ns.put(MSTNode( 312 + keys=tuple_remove_at(node.keys, i), 313 + vals=tuple_remove_at(node.vals, i), 314 + subtrees=node.subtrees[:i] + ( 315 + self._merge(node.subtrees[i], node.subtrees[i + 1]), 316 + ) + node.subtrees[i + 2:] 317 + ))._to_optional() 318 + 319 + def _merge(self, left_cid: Optional[CID], right_cid: Optional[CID]) -> Optional[CID]: 320 + if left_cid is None: 321 + return right_cid # includes the case where left == right == None 322 + if right_cid is None: 323 + return left_cid 324 + left = self.ns.get(left_cid) 325 + right = self.ns.get(right_cid) 326 + return self.ns.put(MSTNode( 327 + keys=left.keys + right.keys, 328 + vals=left.vals + right.vals, 329 + subtrees=left.subtrees[:-1] + ( 330 + self._merge( 331 + left.subtrees[-1], 332 + right.subtrees[0] 333 + ), 334 + ) + right.subtrees[1:] 335 + ))._to_optional() 336 + 337 + def __repr__(self): 338 + return self.pretty(self.root) 339 + 340 + def pretty(self, node_cid: Optional[CID]) -> str: 341 + if node_cid is None: 342 + return "<empty>" 343 + node = self.ns.get(node_cid) 344 + res = f"MSTNode<cid={node.cid.encode("base32")}>(\n{indent(self.pretty(node.subtrees[0]))},\n" 345 + for k, v, t in zip(node.keys, node.vals, node.subtrees[1:]): 346 + res += f" {k!r} ({MSTNode.key_height(k)}) -> {v.encode("base32")},\n" 347 + res += indent(self.pretty(t)) + ",\n" 348 + res += ")" 349 + return res 350 + 351 + 352 + if __name__ == "__main__": 353 + if 0: 354 + from carfile import ReadOnlyCARBlockStore 355 + f = open("/home/david/programming/python/bskyclient/retr0id.car", "rb") 356 + bs = ReadOnlyCARBlockStore(f) 357 + commit_obj = dag_cbor.decode(bs.get(bytes(bs.car_roots[0]))) 358 + mst_root: CID = commit_obj["data"] 359 + ns = NodeStore(bs) 360 + mst = MST(ns, mst_root) 361 + print(mst) 362 + else: 363 + bs = MemoryBlockStore() 364 + ns = NodeStore(bs) 365 + mst = MST(ns) 366 + print(mst) 367 + mst.root = mst._put("hello", hash_to_cid(b"blah")) 368 + print(mst) 369 + mst.root = mst._put("foo", hash_to_cid(b"bar")) 370 + print(mst) 371 + mst.root = mst._put("bar", hash_to_cid(b"bat")) 372 + print(mst) 373 + mst.root = mst._delete("foo") 374 + mst.root = mst._delete("hello") 375 + print(mst)
+9
util.py
··· 1 + from multiformats import multihash, CID 2 + 3 + def indent(msg: str) -> str: 4 + ISTR = " " 5 + return ISTR + msg.replace("\n", "\n"+ISTR) 6 + 7 + def hash_to_cid(data: bytes, codec="dag-cbor") -> CID: 8 + digest = multihash.digest(data, "sha2-256") 9 + return CID("base32", 1, codec, digest)