Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge pull request #18 from at-microcosm/ufos-salty-hash

UFOs HLL DID-hashing: secret prefix

authored by

phil and committed by
GitHub
bc80a588 84f63b52

+284 -150
+25 -13
Cargo.lock
··· 574 574 575 575 [[package]] 576 576 name = "cardinality-estimator-safe" 577 - version = "2.1.1" 577 + version = "4.0.1" 578 578 source = "registry+https://github.com/rust-lang/crates.io-index" 579 - checksum = "50c14632b90cb42ff2174d2a544ca553c1bccfab54b848ae9ab9e004b90243bf" 579 + checksum = "b41ec0cd313b46ba3b508377544b25aa1d56d05ce9e657e77dfb001d5e726e53" 580 580 dependencies = [ 581 + "digest", 581 582 "enum_dispatch", 582 583 "serde", 583 - "wyhash", 584 584 ] 585 585 586 586 [[package]] ··· 1360 1360 1361 1361 [[package]] 1362 1362 name = "getrandom" 1363 - version = "0.3.2" 1363 + version = "0.3.3" 1364 1364 source = "registry+https://github.com/rust-lang/crates.io-index" 1365 - checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" 1365 + checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" 1366 1366 dependencies = [ 1367 1367 "cfg-if", 1368 1368 "libc", ··· 1909 1909 source = "registry+https://github.com/rust-lang/crates.io-index" 1910 1910 checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" 1911 1911 dependencies = [ 1912 - "getrandom 0.3.2", 1912 + "getrandom 0.3.3", 1913 1913 "libc", 1914 1914 ] 1915 1915 ··· 2697 2697 2698 2698 [[package]] 2699 2699 name = "rand" 2700 - version = "0.9.0" 2700 + version = "0.9.1" 2701 2701 source = "registry+https://github.com/rust-lang/crates.io-index" 2702 - checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" 2702 + checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" 2703 2703 dependencies = [ 2704 2704 "rand_chacha 0.9.0", 2705 2705 "rand_core 0.9.3", 2706 - "zerocopy 0.8.24", 2707 2706 ] 2708 2707 2709 2708 [[package]] ··· 2741 2740 source = "registry+https://github.com/rust-lang/crates.io-index" 2742 2741 checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" 2743 2742 dependencies = [ 2744 - "getrandom 0.3.2", 2743 + "getrandom 0.3.3", 2745 2744 ] 2746 2745 2747 2746 [[package]] ··· 3226 3225 ] 3227 3226 3228 3227 [[package]] 3228 + name = "sha2" 3229 + version = "0.10.9" 3230 + source = "registry+https://github.com/rust-lang/crates.io-index" 3231 + checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" 3232 + dependencies = [ 3233 + "cfg-if", 3234 + "cpufeatures", 3235 + "digest", 3236 + ] 3237 + 3238 + [[package]] 3229 3239 name = "sharded-slab" 3230 3240 version = "0.1.7" 3231 3241 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3415 3425 checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" 3416 3426 dependencies = [ 3417 3427 "fastrand", 3418 - "getrandom 0.3.2", 3428 + "getrandom 0.3.3", 3419 3429 "once_cell", 3420 3430 "rustix 1.0.5", 3421 3431 "windows-sys 0.59.0", ··· 3783 3793 "httparse", 3784 3794 "log", 3785 3795 "native-tls", 3786 - "rand 0.9.0", 3796 + "rand 0.9.1", 3787 3797 "sha1", 3788 3798 "thiserror 2.0.12", 3789 3799 "url", ··· 3808 3818 "dropshot", 3809 3819 "env_logger", 3810 3820 "fjall", 3821 + "getrandom 0.3.3", 3811 3822 "http", 3812 3823 "jetstream", 3813 3824 "log", ··· 3816 3827 "semver", 3817 3828 "serde", 3818 3829 "serde_json", 3830 + "sha2", 3819 3831 "tempfile", 3820 3832 "thiserror 2.0.12", 3821 3833 "tikv-jemallocator", ··· 3905 3917 source = "registry+https://github.com/rust-lang/crates.io-index" 3906 3918 checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" 3907 3919 dependencies = [ 3908 - "getrandom 0.3.2", 3920 + "getrandom 0.3.3", 3909 3921 "serde", 3910 3922 ] 3911 3923
+3 -1
ufos/Cargo.toml
··· 7 7 anyhow = "1.0.97" 8 8 async-trait = "0.1.88" 9 9 bincode = { version = "2.0.1", features = ["serde"] } 10 - cardinality-estimator-safe = { version = "2.1.1", features = ["with_serde"] } 10 + cardinality-estimator-safe = { version = "4.0.1", features = ["with_serde", "with_digest"] } 11 11 clap = { version = "4.5.31", features = ["derive"] } 12 12 dropshot = "0.16.0" 13 13 env_logger = "0.11.7" 14 14 fjall = { version = "2.8.0", features = ["lz4"] } 15 + getrandom = "0.3.3" 15 16 http = "1.3.1" 16 17 jetstream = { path = "../jetstream" } 17 18 log = "0.4.26" ··· 20 21 semver = "1.0.26" 21 22 serde = "1.0.219" 22 23 serde_json = "1.0.140" 24 + sha2 = "0.10.9" 23 25 thiserror = "2.0.12" 24 26 tokio = { version = "1.44.2", features = ["full", "sync", "time"] } 25 27
+12 -2
ufos/src/consumer.rs
··· 1 + use crate::store_types::SketchSecretPrefix; 1 2 use jetstream::{ 2 3 events::{Cursor, EventKind, JetstreamEvent}, 3 4 exports::{Did, Nsid}, ··· 32 33 jetstream_receiver: JetstreamReceiver, 33 34 batch_sender: Sender<LimitedBatch>, 34 35 current_batch: CurrentBatch, 36 + sketch_secret: SketchSecretPrefix, 35 37 } 36 38 37 39 pub async fn consume( 38 40 jetstream_endpoint: &str, 39 41 cursor: Option<Cursor>, 40 42 no_compress: bool, 43 + sketch_secret: SketchSecretPrefix, 41 44 ) -> anyhow::Result<Receiver<LimitedBatch>> { 42 45 let endpoint = DefaultJetstreamEndpoints::endpoint_or_shortcut(jetstream_endpoint); 43 46 if endpoint == jetstream_endpoint { ··· 60 63 .connect_cursor(cursor) 61 64 .await?; 62 65 let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE); 63 - let mut batcher = Batcher::new(jetstream_receiver, batch_sender); 66 + let mut batcher = Batcher::new(jetstream_receiver, batch_sender, sketch_secret); 64 67 tokio::task::spawn(async move { batcher.run().await }); 65 68 Ok(batch_reciever) 66 69 } 67 70 68 71 impl Batcher { 69 - pub fn new(jetstream_receiver: JetstreamReceiver, batch_sender: Sender<LimitedBatch>) -> Self { 72 + pub fn new( 73 + jetstream_receiver: JetstreamReceiver, 74 + batch_sender: Sender<LimitedBatch>, 75 + sketch_secret: SketchSecretPrefix, 76 + ) -> Self { 70 77 Self { 71 78 jetstream_receiver, 72 79 batch_sender, 73 80 current_batch: Default::default(), 81 + sketch_secret, 74 82 } 75 83 } 76 84 ··· 129 137 &collection, 130 138 commit, 131 139 MAX_BATCHED_COLLECTIONS, 140 + &self.sketch_secret, 132 141 ); 133 142 134 143 if let Err(BatchInsertError::BatchFull(commit)) = optimistic_res { ··· 137 146 &collection, 138 147 commit, 139 148 MAX_BATCHED_COLLECTIONS, 149 + &self.sketch_secret, 140 150 )?; 141 151 } else { 142 152 optimistic_res?;
+2
ufos/src/db_types.rs
··· 224 224 225 225 ////// 226 226 227 + impl<const N: usize> UseBincodePlz for [u8; N] {} 228 + 227 229 impl DbBytes for Vec<u8> { 228 230 fn to_db_bytes(&self) -> Result<Vec<u8>, EncodingError> { 229 231 Ok(self.to_vec())
+6 -2
ufos/src/file_consumer.rs
··· 1 1 use crate::consumer::{Batcher, LimitedBatch, BATCH_QUEUE_SIZE}; 2 + use crate::store_types::SketchSecretPrefix; 2 3 use anyhow::Result; 3 4 use jetstream::{error::JetstreamEventError, events::JetstreamEvent}; 4 5 use std::path::PathBuf; ··· 21 22 Ok(()) 22 23 } 23 24 24 - pub async fn consume(p: PathBuf) -> Result<Receiver<LimitedBatch>> { 25 + pub async fn consume( 26 + p: PathBuf, 27 + sketch_secret: SketchSecretPrefix, 28 + ) -> Result<Receiver<LimitedBatch>> { 25 29 let f = File::open(p).await?; 26 30 let (jsonl_sender, jsonl_receiver) = channel::<JetstreamEvent>(16); 27 31 let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE); 28 - let mut batcher = Batcher::new(jsonl_receiver, batch_sender); 32 + let mut batcher = Batcher::new(jsonl_receiver, batch_sender, sketch_secret); 29 33 tokio::task::spawn(async move { read_jsonl(f, jsonl_sender).await }); 30 34 tokio::task::spawn(async move { batcher.run().await }); 31 35 Ok(batch_reciever)
+135 -94
ufos/src/lib.rs
··· 10 10 pub mod store_types; 11 11 12 12 use crate::error::BatchInsertError; 13 - use cardinality_estimator_safe::CardinalityEstimator; 13 + use crate::store_types::SketchSecretPrefix; 14 + use cardinality_estimator_safe::{Element, Sketch}; 14 15 use error::FirehoseEventError; 15 16 use jetstream::events::{CommitEvent, CommitOp, Cursor}; 16 17 use jetstream::exports::{Did, Nsid, RecordKey}; 17 18 use schemars::JsonSchema; 18 19 use serde::Serialize; 19 20 use serde_json::value::RawValue; 21 + use sha2::Sha256; 20 22 use std::collections::HashMap; 23 + 24 + fn did_element(sketch_secret: &SketchSecretPrefix, did: &Did) -> Element<14> { 25 + Element::from_digest_with_prefix::<Sha256>(sketch_secret, did.as_bytes()) 26 + } 21 27 22 28 #[derive(Debug, Default, Clone)] 23 29 pub struct CollectionCommits<const LIMIT: usize> { 24 30 pub total_seen: usize, 25 - pub dids_estimate: CardinalityEstimator<Did>, 31 + pub dids_estimate: Sketch<14>, 26 32 pub commits: Vec<UFOsCommit>, 27 33 head: usize, 28 34 non_creates: usize, ··· 35 41 self.head = 0; 36 42 } 37 43 } 38 - pub fn truncating_insert(&mut self, commit: UFOsCommit) -> Result<(), BatchInsertError> { 44 + pub fn truncating_insert( 45 + &mut self, 46 + commit: UFOsCommit, 47 + sketch_secret: &SketchSecretPrefix, 48 + ) -> Result<(), BatchInsertError> { 39 49 if self.non_creates == LIMIT { 40 50 return Err(BatchInsertError::BatchFull(commit)); 41 51 } ··· 66 76 67 77 if is_create { 68 78 self.total_seen += 1; 69 - self.dids_estimate.insert(&did); 79 + self.dids_estimate.insert(did_element(sketch_secret, &did)); 70 80 } else { 71 81 self.non_creates += 1; 72 82 } ··· 158 168 collection: &Nsid, 159 169 commit: UFOsCommit, 160 170 max_collections: usize, 171 + sketch_secret: &SketchSecretPrefix, 161 172 ) -> Result<(), BatchInsertError> { 162 173 let map = &mut self.commits_by_nsid; 163 174 if !map.contains_key(collection) && map.len() >= max_collections { ··· 165 176 } 166 177 map.entry(collection.clone()) 167 178 .or_default() 168 - .truncating_insert(commit)?; 179 + .truncating_insert(commit, sketch_secret)?; 169 180 Ok(()) 170 181 } 171 182 pub fn total_records(&self) -> usize { ··· 181 192 self.account_removes.len() 182 193 } 183 194 pub fn estimate_dids(&self) -> usize { 184 - let mut estimator = CardinalityEstimator::<Did>::new(); 195 + let mut estimator = Sketch::<14>::default(); 185 196 for commits in self.commits_by_nsid.values() { 186 197 estimator.merge(&commits.dids_estimate); 187 198 } ··· 308 319 fn test_truncating_insert_truncates() -> anyhow::Result<()> { 309 320 let mut commits: CollectionCommits<2> = Default::default(); 310 321 311 - commits.truncating_insert(UFOsCommit { 312 - cursor: Cursor::from_raw_u64(100), 313 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 314 - rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(), 315 - rev: "rev-asdf".to_string(), 316 - action: CommitAction::Put(PutAction { 317 - record: RawValue::from_string("{}".to_string())?, 318 - is_update: false, 319 - }), 320 - })?; 322 + commits.truncating_insert( 323 + UFOsCommit { 324 + cursor: Cursor::from_raw_u64(100), 325 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 326 + rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(), 327 + rev: "rev-asdf".to_string(), 328 + action: CommitAction::Put(PutAction { 329 + record: RawValue::from_string("{}".to_string())?, 330 + is_update: false, 331 + }), 332 + }, 333 + &[0u8; 16], 334 + )?; 321 335 322 - commits.truncating_insert(UFOsCommit { 323 - cursor: Cursor::from_raw_u64(101), 324 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 325 - rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(), 326 - rev: "rev-asdg".to_string(), 327 - action: CommitAction::Put(PutAction { 328 - record: RawValue::from_string("{}".to_string())?, 329 - is_update: false, 330 - }), 331 - })?; 336 + commits.truncating_insert( 337 + UFOsCommit { 338 + cursor: Cursor::from_raw_u64(101), 339 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 340 + rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(), 341 + rev: "rev-asdg".to_string(), 342 + action: CommitAction::Put(PutAction { 343 + record: RawValue::from_string("{}".to_string())?, 344 + is_update: false, 345 + }), 346 + }, 347 + &[0u8; 16], 348 + )?; 332 349 333 - commits.truncating_insert(UFOsCommit { 334 - cursor: Cursor::from_raw_u64(102), 335 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 336 - rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(), 337 - rev: "rev-asdh".to_string(), 338 - action: CommitAction::Put(PutAction { 339 - record: RawValue::from_string("{}".to_string())?, 340 - is_update: false, 341 - }), 342 - })?; 350 + commits.truncating_insert( 351 + UFOsCommit { 352 + cursor: Cursor::from_raw_u64(102), 353 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 354 + rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(), 355 + rev: "rev-asdh".to_string(), 356 + action: CommitAction::Put(PutAction { 357 + record: RawValue::from_string("{}".to_string())?, 358 + is_update: false, 359 + }), 360 + }, 361 + &[0u8; 16], 362 + )?; 343 363 344 364 assert_eq!(commits.total_seen, 3); 345 365 assert_eq!(commits.dids_estimate.estimate(), 1); ··· 368 388 fn test_truncating_insert_does_not_truncate_deletes() -> anyhow::Result<()> { 369 389 let mut commits: CollectionCommits<2> = Default::default(); 370 390 371 - commits.truncating_insert(UFOsCommit { 372 - cursor: Cursor::from_raw_u64(100), 373 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 374 - rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(), 375 - rev: "rev-asdf".to_string(), 376 - action: CommitAction::Cut, 377 - })?; 391 + commits.truncating_insert( 392 + UFOsCommit { 393 + cursor: Cursor::from_raw_u64(100), 394 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 395 + rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(), 396 + rev: "rev-asdf".to_string(), 397 + action: CommitAction::Cut, 398 + }, 399 + &[0u8; 16], 400 + )?; 378 401 379 - commits.truncating_insert(UFOsCommit { 380 - cursor: Cursor::from_raw_u64(101), 381 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 382 - rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(), 383 - rev: "rev-asdg".to_string(), 384 - action: CommitAction::Put(PutAction { 385 - record: RawValue::from_string("{}".to_string())?, 386 - is_update: false, 387 - }), 388 - })?; 402 + commits.truncating_insert( 403 + UFOsCommit { 404 + cursor: Cursor::from_raw_u64(101), 405 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 406 + rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(), 407 + rev: "rev-asdg".to_string(), 408 + action: CommitAction::Put(PutAction { 409 + record: RawValue::from_string("{}".to_string())?, 410 + is_update: false, 411 + }), 412 + }, 413 + &[0u8; 16], 414 + )?; 389 415 390 - commits.truncating_insert(UFOsCommit { 391 - cursor: Cursor::from_raw_u64(102), 392 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 393 - rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(), 394 - rev: "rev-asdh".to_string(), 395 - action: CommitAction::Put(PutAction { 396 - record: RawValue::from_string("{}".to_string())?, 397 - is_update: false, 398 - }), 399 - })?; 416 + commits.truncating_insert( 417 + UFOsCommit { 418 + cursor: Cursor::from_raw_u64(102), 419 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 420 + rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(), 421 + rev: "rev-asdh".to_string(), 422 + action: CommitAction::Put(PutAction { 423 + record: RawValue::from_string("{}".to_string())?, 424 + is_update: false, 425 + }), 426 + }, 427 + &[0u8; 16], 428 + )?; 400 429 401 430 assert_eq!(commits.total_seen, 2); 402 431 assert_eq!(commits.dids_estimate.estimate(), 1); ··· 431 460 let mut commits: CollectionCommits<2> = Default::default(); 432 461 433 462 commits 434 - .truncating_insert(UFOsCommit { 435 - cursor: Cursor::from_raw_u64(100), 436 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 437 - rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(), 438 - rev: "rev-asdf".to_string(), 439 - action: CommitAction::Cut, 440 - }) 463 + .truncating_insert( 464 + UFOsCommit { 465 + cursor: Cursor::from_raw_u64(100), 466 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 467 + rkey: RecordKey::new("rkey-asdf-a".to_string()).unwrap(), 468 + rev: "rev-asdf".to_string(), 469 + action: CommitAction::Cut, 470 + }, 471 + &[0u8; 16], 472 + ) 441 473 .unwrap(); 442 474 443 475 // this create will just be discarded 444 476 commits 445 - .truncating_insert(UFOsCommit { 446 - cursor: Cursor::from_raw_u64(80), 447 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 448 - rkey: RecordKey::new("rkey-asdf-zzz".to_string()).unwrap(), 449 - rev: "rev-asdzzz".to_string(), 450 - action: CommitAction::Put(PutAction { 451 - record: RawValue::from_string("{}".to_string())?, 452 - is_update: false, 453 - }), 454 - }) 477 + .truncating_insert( 478 + UFOsCommit { 479 + cursor: Cursor::from_raw_u64(80), 480 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 481 + rkey: RecordKey::new("rkey-asdf-zzz".to_string()).unwrap(), 482 + rev: "rev-asdzzz".to_string(), 483 + action: CommitAction::Put(PutAction { 484 + record: RawValue::from_string("{}".to_string())?, 485 + is_update: false, 486 + }), 487 + }, 488 + &[0u8; 16], 489 + ) 455 490 .unwrap(); 456 491 457 492 commits 458 - .truncating_insert(UFOsCommit { 459 - cursor: Cursor::from_raw_u64(101), 460 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 461 - rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(), 462 - rev: "rev-asdg".to_string(), 463 - action: CommitAction::Cut, 464 - }) 493 + .truncating_insert( 494 + UFOsCommit { 495 + cursor: Cursor::from_raw_u64(101), 496 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 497 + rkey: RecordKey::new("rkey-asdf-b".to_string()).unwrap(), 498 + rev: "rev-asdg".to_string(), 499 + action: CommitAction::Cut, 500 + }, 501 + &[0u8; 16], 502 + ) 465 503 .unwrap(); 466 504 467 - let res = commits.truncating_insert(UFOsCommit { 468 - cursor: Cursor::from_raw_u64(102), 469 - did: Did::new("did:plc:whatever".to_string()).unwrap(), 470 - rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(), 471 - rev: "rev-asdh".to_string(), 472 - action: CommitAction::Cut, 473 - }); 505 + let res = commits.truncating_insert( 506 + UFOsCommit { 507 + cursor: Cursor::from_raw_u64(102), 508 + did: Did::new("did:plc:whatever".to_string()).unwrap(), 509 + rkey: RecordKey::new("rkey-asdf-c".to_string()).unwrap(), 510 + rev: "rev-asdh".to_string(), 511 + action: CommitAction::Cut, 512 + }, 513 + &[0u8; 16], 514 + ); 474 515 475 516 assert!(res.is_err()); 476 517 let overflowed = match res {
+8 -4
ufos/src/main.rs
··· 7 7 use ufos::storage::{StorageWhatever, StoreBackground, StoreReader, StoreWriter}; 8 8 use ufos::storage_fjall::FjallStorage; 9 9 use ufos::storage_mem::MemStorage; 10 + use ufos::store_types::SketchSecretPrefix; 10 11 11 12 #[cfg(not(target_env = "msvc"))] 12 13 use tikv_jemallocator::Jemalloc; ··· 57 58 let args = Args::parse(); 58 59 let jetstream = args.jetstream.clone(); 59 60 if args.in_mem { 60 - let (read_store, write_store, cursor) = MemStorage::init( 61 + let (read_store, write_store, cursor, sketch_secret) = MemStorage::init( 61 62 args.data, 62 63 jetstream, 63 64 args.jetstream_force, ··· 70 71 read_store, 71 72 write_store, 72 73 cursor, 74 + sketch_secret, 73 75 ) 74 76 .await?; 75 77 } else { 76 - let (read_store, write_store, cursor) = FjallStorage::init( 78 + let (read_store, write_store, cursor, sketch_secret) = FjallStorage::init( 77 79 args.data, 78 80 jetstream, 79 81 args.jetstream_force, ··· 86 88 read_store, 87 89 write_store, 88 90 cursor, 91 + sketch_secret, 89 92 ) 90 93 .await?; 91 94 } ··· 100 103 read_store: impl StoreReader + 'static, 101 104 mut write_store: impl StoreWriter<B> + 'static, 102 105 cursor: Option<Cursor>, 106 + sketch_secret: SketchSecretPrefix, 103 107 ) -> anyhow::Result<()> { 104 108 println!("starting server with storage..."); 105 109 let serving = server::serve(read_store); ··· 112 116 113 117 let batches = if jetstream_fixture { 114 118 log::info!("starting with jestream file fixture: {jetstream:?}"); 115 - file_consumer::consume(jetstream.into()).await? 119 + file_consumer::consume(jetstream.into(), sketch_secret).await? 116 120 } else { 117 121 log::info!( 118 122 "starting consumer with cursor: {cursor:?} from {:?} ago", 119 123 cursor.map(|c| c.elapsed()) 120 124 ); 121 - consumer::consume(&jetstream, cursor, false).await? 125 + consumer::consume(&jetstream, cursor, false, sketch_secret).await? 122 126 }; 123 127 124 128 let rolling = write_store.background_tasks()?.run();
+2 -1
ufos/src/storage.rs
··· 1 + use crate::store_types::SketchSecretPrefix; 1 2 use crate::{ 2 3 error::StorageError, ConsumerInfo, Count, Cursor, EventBatch, QueryPeriod, TopCollections, 3 4 UFOsRecord, ··· 16 17 endpoint: String, 17 18 force_endpoint: bool, 18 19 config: C, 19 - ) -> StorageResult<(R, W, Option<Cursor>)> 20 + ) -> StorageResult<(R, W, Option<Cursor>, SketchSecretPrefix)> 20 21 where 21 22 Self: Sized; 22 23 }
+59 -15
ufos/src/storage_fjall.rs
··· 6 6 DeleteAccountQueueVal, HourTruncatedCursor, HourlyDidsKey, HourlyRecordsKey, HourlyRollupKey, 7 7 JetstreamCursorKey, JetstreamCursorValue, JetstreamEndpointKey, JetstreamEndpointValue, 8 8 LiveCountsKey, NewRollupCursorKey, NewRollupCursorValue, NsidRecordFeedKey, NsidRecordFeedVal, 9 - RecordLocationKey, RecordLocationMeta, RecordLocationVal, RecordRawValue, TakeoffKey, 10 - TakeoffValue, TrimCollectionCursorKey, WeekTruncatedCursor, WeeklyDidsKey, WeeklyRecordsKey, 11 - WeeklyRollupKey, 9 + RecordLocationKey, RecordLocationMeta, RecordLocationVal, RecordRawValue, SketchSecretKey, 10 + SketchSecretPrefix, TakeoffKey, TakeoffValue, TrimCollectionCursorKey, WeekTruncatedCursor, 11 + WeeklyDidsKey, WeeklyRecordsKey, WeeklyRollupKey, 12 12 }; 13 13 use crate::{ 14 14 CommitAction, ConsumerInfo, Count, Did, EventBatch, Nsid, QueryPeriod, TopCollections, ··· 45 45 /// - Launch date 46 46 /// - key: "takeoff" (literal) 47 47 /// - val: u64 (micros timestamp, not from jetstream for now so not precise) 48 + /// 49 + /// - Cardinality estimator secret 50 + /// - key: "sketch_secret" (literal) 51 + /// - val: [u8; 16] 48 52 /// 49 53 /// - Rollup cursor (bg work: roll stats into hourlies, delete accounts, old record deletes) 50 54 /// - key: "rollup_cursor" (literal) ··· 141 145 endpoint: String, 142 146 force_endpoint: bool, 143 147 _config: FjallConfig, 144 - ) -> StorageResult<(FjallReader, FjallWriter, Option<Cursor>)> { 148 + ) -> StorageResult<(FjallReader, FjallWriter, Option<Cursor>, SketchSecretPrefix)> { 145 149 let keyspace = { 146 150 let config = Config::new(path); 147 151 ··· 159 163 160 164 let js_cursor = get_static_neu::<JetstreamCursorKey, JetstreamCursorValue>(&global)?; 161 165 162 - if js_cursor.is_some() { 166 + let sketch_secret = if js_cursor.is_some() { 163 167 let stored_endpoint = 164 168 get_static_neu::<JetstreamEndpointKey, JetstreamEndpointValue>(&global)?; 165 - 166 169 let JetstreamEndpointValue(stored) = stored_endpoint.ok_or(StorageError::InitError( 167 170 "found cursor but missing js_endpoint, refusing to start.".to_string(), 168 171 ))?; 169 172 173 + let Some(stored_secret) = 174 + get_static_neu::<SketchSecretKey, SketchSecretPrefix>(&global)? 175 + else { 176 + return Err(StorageError::InitError( 177 + "found cursor but missing sketch_secret, refusing to start.".to_string(), 178 + )); 179 + }; 180 + 170 181 if stored != endpoint { 171 182 if force_endpoint { 172 183 log::warn!("forcing a jetstream switch from {stored:?} to {endpoint:?}"); ··· 179 190 "stored js_endpoint {stored:?} differs from provided {endpoint:?}, refusing to start."))); 180 191 } 181 192 } 193 + stored_secret 182 194 } else { 183 - insert_static_neu::<JetstreamEndpointKey>( 195 + log::info!("initializing a fresh db!"); 196 + init_static_neu::<JetstreamEndpointKey>( 184 197 &global, 185 198 JetstreamEndpointValue(endpoint.to_string()), 186 199 )?; 187 - insert_static_neu::<TakeoffKey>(&global, Cursor::at(SystemTime::now()))?; 188 - insert_static_neu::<NewRollupCursorKey>(&global, Cursor::from_start())?; 189 - } 200 + 201 + log::info!("generating new secret for cardinality sketches..."); 202 + let mut sketch_secret: SketchSecretPrefix = [0u8; 16]; 203 + getrandom::fill(&mut sketch_secret).map_err(|e| { 204 + StorageError::InitError(format!( 205 + "failed to get a random secret for cardinality sketches: {e:?}" 206 + )) 207 + })?; 208 + init_static_neu::<SketchSecretKey>(&global, sketch_secret)?; 209 + 210 + init_static_neu::<TakeoffKey>(&global, Cursor::at(SystemTime::now()))?; 211 + init_static_neu::<NewRollupCursorKey>(&global, Cursor::from_start())?; 212 + 213 + sketch_secret 214 + }; 190 215 191 216 let reader = FjallReader { 192 217 keyspace: keyspace.clone(), ··· 204 229 rollups, 205 230 queues, 206 231 }; 207 - Ok((reader, writer, js_cursor)) 232 + Ok((reader, writer, js_cursor, sketch_secret)) 208 233 } 209 234 } 210 235 ··· 1089 1114 Ok(()) 1090 1115 } 1091 1116 1117 + /// Set a value to a fixed key, erroring if the value already exists 1118 + /// 1119 + /// Intended for single-threaded init: not safe under concurrency, since there 1120 + /// is no transaction between checking if the already exists and writing it. 1121 + fn init_static_neu<K: StaticStr>( 1122 + global: &PartitionHandle, 1123 + value: impl DbBytes, 1124 + ) -> StorageResult<()> { 1125 + let key_bytes = DbStaticStr::<K>::default().to_db_bytes()?; 1126 + if global.get(&key_bytes)?.is_some() { 1127 + return Err(StorageError::InitError(format!( 1128 + "init failed: value for key {key_bytes:?} already exists" 1129 + ))); 1130 + } 1131 + let value_bytes = value.to_db_bytes()?; 1132 + global.insert(&key_bytes, &value_bytes)?; 1133 + Ok(()) 1134 + } 1135 + 1092 1136 /// Set a value to a fixed key 1093 1137 fn insert_batch_static_neu<K: StaticStr>( 1094 1138 batch: &mut FjallBatch, ··· 1132 1176 use serde_json::value::RawValue; 1133 1177 1134 1178 fn fjall_db() -> (FjallReader, FjallWriter) { 1135 - let (read, write, _) = FjallStorage::init( 1179 + let (read, write, _, _) = FjallStorage::init( 1136 1180 tempfile::tempdir().unwrap(), 1137 1181 "offline test (no real jetstream endpoint)".to_string(), 1138 1182 false, ··· 1187 1231 .commits_by_nsid 1188 1232 .entry(collection.clone()) 1189 1233 .or_default() 1190 - .truncating_insert(commit) 1234 + .truncating_insert(commit, &[0u8; 16]) 1191 1235 .unwrap(); 1192 1236 1193 1237 collection ··· 1229 1273 .commits_by_nsid 1230 1274 .entry(collection.clone()) 1231 1275 .or_default() 1232 - .truncating_insert(commit) 1276 + .truncating_insert(commit, &[0u8; 16]) 1233 1277 .unwrap(); 1234 1278 1235 1279 collection ··· 1261 1305 .commits_by_nsid 1262 1306 .entry(collection.clone()) 1263 1307 .or_default() 1264 - .truncating_insert(commit) 1308 + .truncating_insert(commit, &[0u8; 16]) 1265 1309 .unwrap(); 1266 1310 1267 1311 collection
+9 -8
ufos/src/storage_mem.rs
··· 9 9 HourTruncatedCursor, HourlyRollupKey, JetstreamCursorKey, JetstreamCursorValue, 10 10 JetstreamEndpointKey, JetstreamEndpointValue, LiveCountsKey, NewRollupCursorKey, 11 11 NewRollupCursorValue, NsidRecordFeedKey, NsidRecordFeedVal, RecordLocationKey, 12 - RecordLocationMeta, RecordLocationVal, RecordRawValue, TakeoffKey, TakeoffValue, 13 - WeekTruncatedCursor, WeeklyRollupKey, 12 + RecordLocationMeta, RecordLocationVal, RecordRawValue, SketchSecretPrefix, TakeoffKey, 13 + TakeoffValue, WeekTruncatedCursor, WeeklyRollupKey, 14 14 }; 15 15 use crate::{ 16 16 CommitAction, ConsumerInfo, Count, Did, EventBatch, Nsid, QueryPeriod, TopCollections, ··· 257 257 endpoint: String, 258 258 force_endpoint: bool, 259 259 _config: MemConfig, 260 - ) -> StorageResult<(MemReader, MemWriter, Option<Cursor>)> { 260 + ) -> StorageResult<(MemReader, MemWriter, Option<Cursor>, SketchSecretPrefix)> { 261 261 let keyspace = MemKeyspace::open(); 262 262 263 263 let global = keyspace.open_partition("global")?; ··· 312 312 rollups, 313 313 queues, 314 314 }; 315 - Ok((reader, writer, js_cursor)) 315 + let secret_prefix = [0u8; 16]; // in-mem store is always deterministic: no secret 316 + Ok((reader, writer, js_cursor, secret_prefix)) 316 317 } 317 318 } 318 319 ··· 1106 1107 use serde_json::value::RawValue; 1107 1108 1108 1109 fn fjall_db() -> (MemReader, MemWriter) { 1109 - let (read, write, _) = MemStorage::init( 1110 + let (read, write, _, _) = MemStorage::init( 1110 1111 tempfile::tempdir().unwrap(), 1111 1112 "offline test (no real jetstream endpoint)".to_string(), 1112 1113 false, ··· 1161 1162 .commits_by_nsid 1162 1163 .entry(collection.clone()) 1163 1164 .or_default() 1164 - .truncating_insert(commit) 1165 + .truncating_insert(commit, &[0u8; 16]) 1165 1166 .unwrap(); 1166 1167 1167 1168 collection ··· 1203 1204 .commits_by_nsid 1204 1205 .entry(collection.clone()) 1205 1206 .or_default() 1206 - .truncating_insert(commit) 1207 + .truncating_insert(commit, &[0u8; 16]) 1207 1208 .unwrap(); 1208 1209 1209 1210 collection ··· 1235 1236 .commits_by_nsid 1236 1237 .entry(collection.clone()) 1237 1238 .or_default() 1238 - .truncating_insert(commit) 1239 + .truncating_insert(commit, &[0u8; 16]) 1239 1240 .unwrap(); 1240 1241 1241 1242 collection
+23 -10
ufos/src/store_types.rs
··· 3 3 }; 4 4 use crate::{Cursor, Did, Nsid, PutAction, RecordKey, UFOsCommit}; 5 5 use bincode::{Decode, Encode}; 6 - use cardinality_estimator_safe::CardinalityEstimator; 6 + use cardinality_estimator_safe::Sketch; 7 7 use std::ops::Range; 8 8 9 9 macro_rules! static_str { ··· 21 21 // key format: ["js_cursor"] 22 22 static_str!("js_cursor", JetstreamCursorKey); 23 23 pub type JetstreamCursorValue = Cursor; 24 + 25 + // key format: ["sketch_secret"] 26 + static_str!("sketch_secret", SketchSecretKey); 27 + pub type SketchSecretPrefix = [u8; 16]; 24 28 25 29 // key format: ["rollup_cursor"] 26 30 static_str!("rollup_cursor", NewRollupCursorKey); ··· 199 203 impl UseBincodePlz for TotalRecordsValue {} 200 204 201 205 #[derive(Debug, PartialEq, serde::Serialize, serde::Deserialize)] 202 - pub struct EstimatedDidsValue(pub CardinalityEstimator<Did>); 206 + pub struct EstimatedDidsValue(pub Sketch<14>); 203 207 impl SerdeBytes for EstimatedDidsValue {} 204 208 impl DbBytes for EstimatedDidsValue { 205 209 #[cfg(test)] ··· 223 227 224 228 pub type CountsValue = DbConcat<TotalRecordsValue, EstimatedDidsValue>; 225 229 impl CountsValue { 226 - pub fn new(total: u64, dids: CardinalityEstimator<Did>) -> Self { 230 + pub fn new(total: u64, dids: Sketch<14>) -> Self { 227 231 Self { 228 232 prefix: TotalRecordsValue(total), 229 233 suffix: EstimatedDidsValue(dids), ··· 232 236 pub fn records(&self) -> u64 { 233 237 self.prefix.0 234 238 } 235 - pub fn dids(&self) -> &CardinalityEstimator<Did> { 239 + pub fn dids(&self) -> &Sketch<14> { 236 240 &self.suffix.0 237 241 } 238 242 pub fn merge(&mut self, other: &Self) { ··· 244 248 fn default() -> Self { 245 249 Self { 246 250 prefix: TotalRecordsValue(0), 247 - suffix: EstimatedDidsValue(CardinalityEstimator::new()), 251 + suffix: EstimatedDidsValue(Sketch::<14>::default()), 248 252 } 249 253 } 250 254 } ··· 433 437 #[cfg(test)] 434 438 mod test { 435 439 use super::{ 436 - CardinalityEstimator, CountsValue, Cursor, Did, EncodingError, HourTruncatedCursor, 437 - HourlyRollupKey, Nsid, HOUR_IN_MICROS, 440 + CountsValue, Cursor, Did, EncodingError, HourTruncatedCursor, HourlyRollupKey, Nsid, 441 + Sketch, HOUR_IN_MICROS, 438 442 }; 439 443 use crate::db_types::DbBytes; 444 + use cardinality_estimator_safe::Element; 445 + use sha2::Sha256; 440 446 441 447 #[test] 442 448 fn test_by_hourly_rollup_key() -> Result<(), EncodingError> { ··· 456 462 457 463 #[test] 458 464 fn test_by_hourly_rollup_value() -> Result<(), EncodingError> { 459 - let mut estimator = CardinalityEstimator::new(); 465 + let mut estimator = Sketch::<14>::default(); 466 + fn to_element(d: Did) -> Element<14> { 467 + Element::from_digest_oneshot::<Sha256>(d.to_string().as_bytes()) 468 + } 460 469 for i in 0..10 { 461 - estimator.insert(&Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig7{i}")).unwrap()); 470 + estimator.insert(to_element( 471 + Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig7{i}")).unwrap(), 472 + )); 462 473 } 463 474 let original = CountsValue::new(123, estimator.clone()); 464 475 let serialized = original.to_db_bytes()?; ··· 467 478 assert_eq!(bytes_consumed, serialized.len()); 468 479 469 480 for i in 10..1_000 { 470 - estimator.insert(&Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig{i}")).unwrap()); 481 + estimator.insert(to_element( 482 + Did::new(format!("did:plc:inze6wrmsm7pjl7yta3oig{i}")).unwrap(), 483 + )); 471 484 } 472 485 let original = CountsValue::new(123, estimator); 473 486 let serialized = original.to_db_bytes()?;