this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

restructuring

+248 -243
-227
src/atmst/mst/__init__.py
import hashlib
import dag_cbor
import operator
from multiformats import multihash, CID
from functools import cached_property
from more_itertools import ilen
from itertools import takewhile
from dataclasses import dataclass
from typing import Tuple, Self, Optional


@dataclass(frozen=True)  # frozen == immutable == win
class MSTNode:
    """
    A single node of a Merkle Search Tree.

    k/v pairs are interleaved between subtrees like so: ::

        keys:         (0,  1,  2,  3)
        vals:         (0,  1,  2,  3)
        subtrees:   (0,  1,  2,  3,  4)

    If a method is implemented in this class, it's because it's a function/property
    of a single node, as opposed to a whole tree
    """
    keys: Tuple[str, ...]  # collection/rkey
    vals: Tuple[CID, ...]  # record CIDs
    subtrees: Tuple[Optional[CID], ...]  # a None value represents an empty subtree

    # NB: __init__ is auto-generated by dataclass decorator

    # these checks should never fail, and could be skipped for performance
    def __post_init__(self) -> None:
        """
        Validate the length invariants between keys, vals and subtrees.

        Raises ValueError if there is not exactly one more subtree than keys,
        or if keys and vals differ in length.
        """
        # TODO: maybe check that they're tuples here?
        # implicitly, the length of self.subtrees must be at least 1
        if len(self.subtrees) != len(self.keys) + 1:
            raise ValueError("Invalid subtree count")
        if len(self.keys) != len(self.vals):
            raise ValueError("Mismatched keys/vals lengths")

    @classmethod
    def empty_root(cls) -> Self:
        """
        Return a node with no keys, no values and a single empty (None) subtree.
        """
        return cls(
            subtrees=(None,),
            keys=(),
            vals=()
        )

    # this should maybe not be implemented here?
    @staticmethod
    def key_height(key: str) -> int:
        """
        Return the height of a key: the number of leading zero bits in the
        SHA-256 digest of the encoded key, divided by two (rounded down).
        """
        digest = int.from_bytes(hashlib.sha256(key.encode()).digest(), "big")
        leading_zeroes = 256 - digest.bit_length()
        return leading_zeroes // 2

    # since we're immutable, this can be cached
    @cached_property
    def cid(self) -> CID:
        """
        The CID (version 1, base32, dag-cbor codec) of the sha2-256 multihash
        of this node's serialised bytes.
        """
        digest = multihash.digest(self.serialised, "sha2-256")
        cid = CID("base32", 1, "dag-cbor", digest)
        return cid

    # likewise
    @cached_property
    def serialised(self) -> bytes:
        """
        The dag-cbor encoding of this node.

        The encoded map has two fields: "l" (the leftmost subtree) and "e" (a
        list of entries). Each entry holds "p", the number of leading bytes its
        key shares with the previous key in this node; "k", the remaining key
        bytes; "v", the value; and "t", the subtree following the key.
        """
        e = []
        prev_key = b""
        for subtree, key_str, value in zip(self.subtrees[1:], self.keys, self.vals):
            key_bytes = key_str.encode()
            shared_prefix_len = ilen(takewhile(bool, map(operator.eq, prev_key, key_bytes)))  # I love functional programming
            e.append({
                "k": key_bytes[shared_prefix_len:],
                "p": shared_prefix_len,
                "t": subtree,
                "v": value,
            })
            prev_key = key_bytes
        return dag_cbor.encode({
            "e": e,
            "l": self.subtrees[0]
        })

    @classmethod
    def deserialise(cls, data: bytes) -> Self:
        """
        Decode a dag-cbor encoded node, rejecting non-canonical encodings.

        Raises ValueError if the top-level value or any entry is not a map with
        exactly the expected fields, if a prefix length is negative or longer
        than the previous key, if a prefix length is shorter than it could have
        been, or if the keys are not in strictly increasing order.
        """
        cbor = dag_cbor.decode(data)
        # check the exact field names (not just the count), so that a wrongly
        # named field is reported as a malformed node rather than a KeyError
        if not isinstance(cbor, dict) or cbor.keys() != {"e", "l"}:
            raise ValueError("malformed MST node")
        subtrees = [cbor["l"]]
        keys = []
        vals = []
        prev_key = b""
        for e in cbor["e"]:  # TODO: make extra sure that these checks are watertight wrt non-canonical representations
            if not isinstance(e, dict) or e.keys() != {"k", "p", "t", "v"}:
                raise ValueError("malformed MST node")
            prefix_len: int = e["p"]
            suffix: bytes = e["k"]
            # a negative prefix_len must be rejected explicitly: python would
            # otherwise treat it as an offset from the end of prev_key when slicing
            if not 0 <= prefix_len <= len(prev_key):
                raise ValueError("invalid MST key prefix len")
            # if the byte after the shared prefix matches the first suffix byte,
            # the prefix could have been one byte longer
            if prev_key[prefix_len:prefix_len+1] == suffix[:1]:
                raise ValueError("non-optimal MST key prefix len")
            this_key = prev_key[:prefix_len] + suffix
            if this_key <= prev_key:
                raise ValueError("invalid MST key sort order")
            keys.append(this_key.decode())
            vals.append(e["v"])
            subtrees.append(e["t"])
            prev_key = this_key

        return cls(
            subtrees=tuple(subtrees),
            keys=tuple(keys),
            vals=tuple(vals)
        )

    def is_empty(self) -> bool:
        """
        True if this node has no keys and its only subtree is empty.
        """
        return self.subtrees == (None,)

    def _to_optional(self) -> Optional[CID]:
        """
        returns None if the node is empty
        """
        if self.is_empty():
            return None
        return self.cid

    @cached_property
    def height(self) -> int:
        """
        The height of this node, derived from its first key. An empty tree has
        height 0. Raises Exception for a keyless node with a non-empty subtree,
        whose height cannot be determined from the node alone.
        """
        # if there are keys at this level, query one directly
        if self.keys:
            return self.key_height(self.keys[0])

        # we're an empty tree
        if self.subtrees[0] is None:
            return 0

        # this should only happen for non-root nodes with no keys
        raise Exception("cannot determine node height")

    def gte_index(self, key: str) -> int:
        """
        find the index of the first key greater than or equal to the specified key
        if all keys are smaller, it returns len(keys)
        """
        i = 0  # this loop could be a binary search but not worth it for small fanouts
        while i < len(self.keys) and key > self.keys[i]:
            i += 1
        return i


# Disabled ad-hoc smoke test, kept as an unused string literal. It refers to
# names (NodeStore, NodeWrangler, mst_diff, ...) that this module does not import.
"""
if __name__ == "__main__":
    from .blockstore import MemoryBlockStore, OverlayBlockStore
    from .blockstore.car_reader import ReadOnlyCARBlockStore

    if 0:
        import sys
        sys.setrecursionlimit(999999999)
        f = open("/home/david/programming/python/bskyclient/retr0id.car", "rb")
        bs = OverlayBlockStore(MemoryBlockStore(), ReadOnlyCARBlockStore(f))
        commit_obj = dag_cbor.decode(bs.get_block(bytes(bs.lower.car_roots[0])))
        mst_root: CID = commit_obj["data"]
        ns = NodeStore(bs)
        wrangler = NodeWrangler(ns)
        #print(wrangler)
        #enumerate_mst(ns, mst_root)
        enumerate_mst_range(ns, mst_root, "app.bsky.feed.generator/", "app.bsky.feed.generator/\xff")

        root2 = wrangler.del_record(mst_root, "app.bsky.feed.generator/alttext")
        root2 = wrangler.del_record(root2, "app.bsky.feed.like/3kas3fyvkti22")
        root2 = wrangler.put_record(root2, "app.bsky.feed.like/3kc3brpic2z2p", hash_to_cid(b"blah"))

        c, d = mst_diff(ns, mst_root, root2)
        print("CREATED:")
        for x in c:
            print("created", x.encode("base32"))
        print("DELETED:")
        for x in d:
            print("deleted", x.encode("base32"))

        for op in record_diff(ns, c, d):
            print(op)

        e, f = very_slow_mst_diff(ns, mst_root, root2)
        assert(e == c)
        assert(f == d)
    else:
        bs = MemoryBlockStore()
        ns = NodeStore(bs)
        wrangler = NodeWrangler(ns)
        root = ns.get_node(None).cid
        print(ns.pretty(root))
        root = wrangler.put_record(root, "hello", hash_to_cid(b"blah"))
        print(ns.pretty(root))
        root = wrangler.put_record(root, "foo", hash_to_cid(b"bar"))
        print(ns.pretty(root))
        root_a = root
        root = wrangler.put_record(root, "bar", hash_to_cid(b"bat"))
        root = wrangler.put_record(root, "xyzz", hash_to_cid(b"bat"))
        root = wrangler.del_record(root, "foo")
        print("=============")
        print(ns.pretty(root_a))
        print("=============")
        print(ns.pretty(root))
        #exit()
        print("=============")
        enumerate_mst(ns, root)
        c, d = mst_diff(ns, root_a, root)
        print("CREATED:")
        for x in c:
            print("created", x.encode("base32"))
        print("DELETED:")
        for x in d:
            print("deleted", x.encode("base32"))

        e, f = very_slow_mst_diff(ns, root_a, root)
        assert(e == c)
        assert(f == d)

        exit()
        root = wrangler.delete(root, "foo")
        root = wrangler.delete(root, "hello")
        print(ns.pretty(root))
        root = wrangler.delete(root, "bar")
        print(ns.pretty(root))
        root = wrangler.delete(root, "bar")
        print(ns.pretty(root))
"""
+18 -13
src/atmst/mst/diff.py
··· 4 4 5 5 from multiformats import CID 6 6 7 - from . import MSTNode 7 + from .node import MSTNode 8 8 from .node_store import NodeStore 9 9 from .node_walker import NodeWalker 10 10 11 11 12 12 def record_diff(ns: NodeStore, created: set[CID], deleted: set[CID]) -> Iterable[tuple]: 13 13 """ 14 - Given two sets of MST nodes (for example, the result of `mst_diff`), this 15 - returns an iterator of record changes, in one of 3 formats: 14 + Given two sets of MST nodes (for example, the result of :meth:`mst_diff`), this 15 + returns an iterator of record changes, in one of 3 formats: :: 16 16 17 - ("created", key, value) 18 - ("updated", key, old_value, new_value) 19 - ("deleted", key, value) 17 + ("created", key, value) 18 + ("updated", key, old_value, new_value) 19 + ("deleted", key, value) 20 + 20 21 """ 21 22 created_kv = reduce(operator.__or__, ({ k: v for k, v in zip(node.keys, node.vals)} for node in map(ns.get_node, created)), {}) 22 23 deleted_kv = reduce(operator.__or__, ({ k: v for k, v in zip(node.keys, node.vals)} for node in map(ns.get_node, deleted)), {}) ··· 32 33 33 34 def very_slow_mst_diff(ns: NodeStore, root_a: CID, root_b: CID): 34 35 """ 35 - This should return the same result as mst_diff, but it gets there in a very slow 36 - yet less error-prone way, so it's useful for testing. 36 + This should return the same result as :meth:`mst_diff`, but it gets there in a slow 37 + but much more obvious way (enumerating all nodes), so it's useful for testing. 37 38 38 39 It's actually faster for smaller trees, but it chokes on trees with thousands of nodes (especially if the NodeStore is slow). 
39 40 """ ··· 44 45 EMPTY_NODE_CID = MSTNode.empty_root().cid 45 46 46 47 def mst_diff(ns: NodeStore, root_a: CID, root_b: CID) -> Tuple[Set[CID], Set[CID]]: # created, deleted 48 + """ 49 + Given two MST root node CIDs, efficiently compute the difference between them, represented as 50 + two sets holding the created and deleted MST nodes respectively (referenced by CIDs). 51 + """ 47 52 created = set() # MST nodes in b but not in a 48 53 deleted = set() # MST nodes in a but not in b 49 - mst_diff_recursive(created, deleted, NodeWalker(ns, root_a), NodeWalker(ns, root_b)) 54 + _mst_diff_recursive(created, deleted, NodeWalker(ns, root_a), NodeWalker(ns, root_b)) 50 55 middle = created & deleted # my algorithm has occasional false-positives 51 56 #assert(not middle) # this fails 52 57 #print("middle", len(middle)) ··· 59 64 created.add(EMPTY_NODE_CID) 60 65 return created, deleted 61 66 62 - def mst_diff_recursive(created: Set[CID], deleted: Set[CID], a: NodeWalker, b: NodeWalker): # created, deleted 67 + def _mst_diff_recursive(created: Set[CID], deleted: Set[CID], a: NodeWalker, b: NodeWalker): # created, deleted 63 68 # the easiest of all cases 64 - if a.frame.node.cid == b.frame.node.cid: 69 + if a.frame.node == b.frame.node: 65 70 return # no difference 66 71 67 72 # trivial ··· 113 118 114 119 # the rkeys now match, but the subrees below us might not 115 120 116 - mst_diff_recursive(created, deleted, a.subtree_walker(), b.subtree_walker()) 121 + _mst_diff_recursive(created, deleted, a.subtree_walker(), b.subtree_walker()) 117 122 118 123 # check if we can still go right XXX: do we need to care about the case where one can, but the other can't? 119 124 # To consider: maybe if I just step a, b will catch up automagically 120 - if a.rkey == a.stack[0].rkey and b.rkey == a.stack[0].rkey: 125 + if a.rkey == a.stack[0].rkey and b.rkey == b.stack[0].rkey: 121 126 break 122 127 123 128 a.right()
+227
src/atmst/mst/node.py
import hashlib
import dag_cbor
import operator
from multiformats import multihash, CID
from functools import cached_property
from more_itertools import ilen
from itertools import takewhile
from dataclasses import dataclass
from typing import Tuple, Self, Optional


@dataclass(frozen=True)  # frozen == immutable == win
class MSTNode:
    """
    A single node of a Merkle Search Tree.

    k/v pairs are interleaved between subtrees like so: ::

        keys:         (0,  1,  2,  3)
        vals:         (0,  1,  2,  3)
        subtrees:   (0,  1,  2,  3,  4)

    If a method is implemented in this class, it's because it's a function/property
    of a single node, as opposed to a whole tree
    """
    keys: Tuple[str, ...]  # collection/rkey
    vals: Tuple[CID, ...]  # record CIDs
    subtrees: Tuple[Optional[CID], ...]  # a None value represents an empty subtree

    # NB: __init__ is auto-generated by dataclass decorator

    # these checks should never fail, and could be skipped for performance
    def __post_init__(self) -> None:
        """
        Validate the length invariants between keys, vals and subtrees.

        Raises ValueError if there is not exactly one more subtree than keys,
        or if keys and vals differ in length.
        """
        # TODO: maybe check that they're tuples here?
        # implicitly, the length of self.subtrees must be at least 1
        if len(self.subtrees) != len(self.keys) + 1:
            raise ValueError("Invalid subtree count")
        if len(self.keys) != len(self.vals):
            raise ValueError("Mismatched keys/vals lengths")

    @classmethod
    def empty_root(cls) -> Self:
        """
        Return a node with no keys, no values and a single empty (None) subtree.
        """
        return cls(
            subtrees=(None,),
            keys=(),
            vals=()
        )

    # this should maybe not be implemented here?
    @staticmethod
    def key_height(key: str) -> int:
        """
        Return the height of a key: the number of leading zero bits in the
        SHA-256 digest of the encoded key, divided by two (rounded down).
        """
        digest = int.from_bytes(hashlib.sha256(key.encode()).digest(), "big")
        leading_zeroes = 256 - digest.bit_length()
        return leading_zeroes // 2

    # since we're immutable, this can be cached
    @cached_property
    def cid(self) -> CID:
        """
        The CID (version 1, base32, dag-cbor codec) of the sha2-256 multihash
        of this node's serialised bytes.
        """
        digest = multihash.digest(self.serialised, "sha2-256")
        cid = CID("base32", 1, "dag-cbor", digest)
        return cid

    # likewise
    @cached_property
    def serialised(self) -> bytes:
        """
        The dag-cbor encoding of this node.

        The encoded map has two fields: "l" (the leftmost subtree) and "e" (a
        list of entries). Each entry holds "p", the number of leading bytes its
        key shares with the previous key in this node; "k", the remaining key
        bytes; "v", the value; and "t", the subtree following the key.
        """
        e = []
        prev_key = b""
        for subtree, key_str, value in zip(self.subtrees[1:], self.keys, self.vals):
            key_bytes = key_str.encode()
            shared_prefix_len = ilen(takewhile(bool, map(operator.eq, prev_key, key_bytes)))  # I love functional programming
            e.append({
                "k": key_bytes[shared_prefix_len:],
                "p": shared_prefix_len,
                "t": subtree,
                "v": value,
            })
            prev_key = key_bytes
        return dag_cbor.encode({
            "e": e,
            "l": self.subtrees[0]
        })

    @classmethod
    def deserialise(cls, data: bytes) -> Self:
        """
        Decode a dag-cbor encoded node, rejecting non-canonical encodings.

        Raises ValueError if the top-level value or any entry is not a map with
        exactly the expected fields, if a prefix length is negative or longer
        than the previous key, if a prefix length is shorter than it could have
        been, or if the keys are not in strictly increasing order.
        """
        cbor = dag_cbor.decode(data)
        # check the exact field names (not just the count), so that a wrongly
        # named field is reported as a malformed node rather than a KeyError
        if not isinstance(cbor, dict) or cbor.keys() != {"e", "l"}:
            raise ValueError("malformed MST node")
        subtrees = [cbor["l"]]
        keys = []
        vals = []
        prev_key = b""
        for e in cbor["e"]:  # TODO: make extra sure that these checks are watertight wrt non-canonical representations
            if not isinstance(e, dict) or e.keys() != {"k", "p", "t", "v"}:
                raise ValueError("malformed MST node")
            prefix_len: int = e["p"]
            suffix: bytes = e["k"]
            # a negative prefix_len must be rejected explicitly: python would
            # otherwise treat it as an offset from the end of prev_key when slicing
            if not 0 <= prefix_len <= len(prev_key):
                raise ValueError("invalid MST key prefix len")
            # if the byte after the shared prefix matches the first suffix byte,
            # the prefix could have been one byte longer
            if prev_key[prefix_len:prefix_len+1] == suffix[:1]:
                raise ValueError("non-optimal MST key prefix len")
            this_key = prev_key[:prefix_len] + suffix
            if this_key <= prev_key:
                raise ValueError("invalid MST key sort order")
            keys.append(this_key.decode())
            vals.append(e["v"])
            subtrees.append(e["t"])
            prev_key = this_key

        return cls(
            subtrees=tuple(subtrees),
            keys=tuple(keys),
            vals=tuple(vals)
        )

    def is_empty(self) -> bool:
        """
        True if this node has no keys and its only subtree is empty.
        """
        return self.subtrees == (None,)

    def _to_optional(self) -> Optional[CID]:
        """
        returns None if the node is empty
        """
        if self.is_empty():
            return None
        return self.cid

    @cached_property
    def height(self) -> int:
        """
        The height of this node, derived from its first key. An empty tree has
        height 0. Raises Exception for a keyless node with a non-empty subtree,
        whose height cannot be determined from the node alone.
        """
        # if there are keys at this level, query one directly
        if self.keys:
            return self.key_height(self.keys[0])

        # we're an empty tree
        if self.subtrees[0] is None:
            return 0

        # this should only happen for non-root nodes with no keys
        raise Exception("cannot determine node height")

    def gte_index(self, key: str) -> int:
        """
        find the index of the first key greater than or equal to the specified key
        if all keys are smaller, it returns len(keys)
        """
        i = 0  # this loop could be a binary search but not worth it for small fanouts
        while i < len(self.keys) and key > self.keys[i]:
            i += 1
        return i


# Disabled ad-hoc smoke test, kept as an unused string literal. It refers to
# names (NodeStore, NodeWrangler, mst_diff, ...) that this module does not import.
"""
if __name__ == "__main__":
    from .blockstore import MemoryBlockStore, OverlayBlockStore
    from .blockstore.car_reader import ReadOnlyCARBlockStore

    if 0:
        import sys
        sys.setrecursionlimit(999999999)
        f = open("/home/david/programming/python/bskyclient/retr0id.car", "rb")
        bs = OverlayBlockStore(MemoryBlockStore(), ReadOnlyCARBlockStore(f))
        commit_obj = dag_cbor.decode(bs.get_block(bytes(bs.lower.car_roots[0])))
        mst_root: CID = commit_obj["data"]
        ns = NodeStore(bs)
        wrangler = NodeWrangler(ns)
        #print(wrangler)
        #enumerate_mst(ns, mst_root)
        enumerate_mst_range(ns, mst_root, "app.bsky.feed.generator/", "app.bsky.feed.generator/\xff")

        root2 = wrangler.del_record(mst_root, "app.bsky.feed.generator/alttext")
        root2 = wrangler.del_record(root2, "app.bsky.feed.like/3kas3fyvkti22")
        root2 = wrangler.put_record(root2, "app.bsky.feed.like/3kc3brpic2z2p", hash_to_cid(b"blah"))

        c, d = mst_diff(ns, mst_root, root2)
        print("CREATED:")
        for x in c:
            print("created", x.encode("base32"))
        print("DELETED:")
        for x in d:
            print("deleted", x.encode("base32"))

        for op in record_diff(ns, c, d):
            print(op)

        e, f = very_slow_mst_diff(ns, mst_root, root2)
        assert(e == c)
        assert(f == d)
    else:
        bs = MemoryBlockStore()
        ns = NodeStore(bs)
        wrangler = NodeWrangler(ns)
        root = ns.get_node(None).cid
        print(ns.pretty(root))
        root = wrangler.put_record(root, "hello", hash_to_cid(b"blah"))
        print(ns.pretty(root))
        root = wrangler.put_record(root, "foo", hash_to_cid(b"bar"))
        print(ns.pretty(root))
        root_a = root
        root = wrangler.put_record(root, "bar", hash_to_cid(b"bat"))
        root = wrangler.put_record(root, "xyzz", hash_to_cid(b"bat"))
        root = wrangler.del_record(root, "foo")
        print("=============")
        print(ns.pretty(root_a))
        print("=============")
        print(ns.pretty(root))
        #exit()
        print("=============")
        enumerate_mst(ns, root)
        c, d = mst_diff(ns, root_a, root)
        print("CREATED:")
        for x in c:
            print("created", x.encode("base32"))
        print("DELETED:")
        for x in d:
            print("deleted", x.encode("base32"))

        e, f = very_slow_mst_diff(ns, root_a, root)
        assert(e == c)
        assert(f == d)

        exit()
        root = wrangler.delete(root, "foo")
        root = wrangler.delete(root, "hello")
        print(ns.pretty(root))
        root = wrangler.delete(root, "bar")
        print(ns.pretty(root))
        root = wrangler.delete(root, "bar")
        print(ns.pretty(root))
"""
+1 -1
src/atmst/mst/node_store.py
··· 4 4 5 5 from ..blockstore import BlockStore 6 6 from ..util import indent 7 - from . import MSTNode 7 + from .node import MSTNode 8 8 9 9 class NodeStore: 10 10 """
+1 -1
src/atmst/mst/node_walker.py
··· 3 3 4 4 from multiformats import CID 5 5 6 - from . import MSTNode 6 + from .node import MSTNode 7 7 from .node_store import NodeStore 8 8 9 9 class NodeWalker:
+1 -1
src/atmst/mst/wrangler.py
··· 2 2 3 3 from multiformats import CID 4 4 5 - from . import MSTNode 5 + from .node import MSTNode 6 6 from .node_store import NodeStore 7 7 8 8 # tuple helpers