Server tools to backfill, tail, mirror, and verify PLC logs
50
fork

Configure Feed

Select the types of activity you want to include in your feed.

complete-ish bulk backfill procedure

phil f3098d01 96a5b454

+90 -59
+3 -3
src/backfill.rs
··· 26 26 let dest = dest.clone(); 27 27 let source = source.clone(); 28 28 workers.spawn(async move { 29 - log::info!("about to get weeks..."); 29 + log::trace!("about to get weeks..."); 30 30 31 31 while let Some(week) = weeks.lock().await.pop() { 32 - log::info!( 32 + log::trace!( 33 33 "worker {w}: fetching week {} (-{})", 34 34 Into::<Dt>::into(week).to_rfc3339(), 35 35 week.n_ago(), 36 36 ); 37 37 week_to_pages(source.clone(), week, dest.clone()).await?; 38 - log::info!("done a week"); 38 + log::trace!("week {}", Into::<Dt>::into(week).to_rfc3339()); 39 39 } 40 40 log::info!("done with the weeks ig"); 41 41 Ok(())
+7 -1
src/bin/allegedly.rs
··· 37 37 /// Pass a postgres connection url like "postgresql://localhost:5432" 38 38 #[arg(long)] 39 39 to_postgres: Option<Url>, 40 + /// Delete all operations from the postgres db before starting 41 + /// 42 + /// only used if `--to-postgres` is present 43 + #[arg(long, action)] 44 + postgres_reset: bool, 40 45 /// Stop at the week ending before this date 41 46 #[arg(long)] 42 47 until: Option<Dt>, ··· 91 96 dir, 92 97 source_workers, 93 98 to_postgres, 99 + postgres_reset, 94 100 until, 95 101 } => { 96 102 let (tx, rx) = flume::bounded(32); // big pages ··· 109 115 }); 110 116 if let Some(url) = to_postgres { 111 117 let db = Db::new(url.as_str()).await.unwrap(); 112 - pages_to_pg(db, rx).await.unwrap(); 118 + pages_to_pg(db, rx, postgres_reset).await.unwrap(); 113 119 } else { 114 120 pages_to_stdout(rx).await.unwrap(); 115 121 }
+80 -55
src/plc_pg.rs
··· 71 71 } 72 72 } 73 73 74 - pub async fn write_bulk(db: Db, pages: flume::Receiver<ExportPage>) -> Result<(), PgError> { 74 + /// Dump rows into an empty operations table quickly 75 + /// 76 + /// you must run this after initializing the db with kysely migrations from the 77 + /// typescript app, but before inserting any content. 78 + /// 79 + /// it's an invasive process: it will drop the indexes that kysely created (and 80 + /// restore them after) in order to get the backfill in as quickly as possible. 81 + /// 82 + /// fails: if the backfill data violates the primary key constraint (unique did*cid) 83 + /// 84 + /// panics: if the operations or dids tables are not empty, unless reset is true 85 + /// 86 + /// recommended postgres setting: `max_wal_size=4GB` (or more) 87 + pub async fn write_bulk( 88 + db: Db, 89 + pages: flume::Receiver<ExportPage>, 90 + reset: bool, 91 + ) -> Result<(), PgError> { 75 92 let mut client = db.connect().await?; 76 93 77 - // TODO: maybe we want to be more cautious 78 - client 79 - .execute( 80 - r#" 81 - DROP TABLE IF EXISTS allegedly_backfill"#, 82 - &[], 83 - ) 94 + let t0 = Instant::now(); 95 + let tx = client.transaction().await?; 96 + 97 + let t_step = Instant::now(); 98 + for table in ["operations", "dids"] { 99 + if reset { 100 + let n = tx.execute(&format!("DELETE FROM {table}"), &[]).await?; 101 + if n > 0 { 102 + log::warn!("postgres reset: deleted {n} from {table}"); 103 + } 104 + } else { 105 + let n: i64 = tx 106 + .query_one(&format!("SELECT count(*) FROM {table}"), &[]) 107 + .await? 108 + .get(0); 109 + if n > 0 { 110 + panic!("postgres: {table} table was not empty and `reset` not requested"); 111 + } 112 + } 113 + } 114 + log::trace!("tables clean: {:?}", t_step.elapsed()); 115 + 116 + let t_step = Instant::now(); 117 + tx.execute("ALTER TABLE operations SET UNLOGGED", &[]) 84 118 .await?; 119 + tx.execute("ALTER TABLE dids SET UNLOGGED", &[]).await?; 120 + log::trace!("set tables unlogged: {:?}", t_step.elapsed()); 85 121 86 - let tx = client.transaction().await?; 122 + let t_step = Instant::now(); 123 + tx.execute(r#"DROP INDEX "operations_createdAt_index""#, &[]) 124 + .await?; 125 + tx.execute("DROP INDEX operations_did_createdat_idx", &[]) 126 + .await?; 127 + log::trace!("indexes dropped: {:?}", t_step.elapsed()); 87 128 88 - tx.execute( 89 - r#" 90 - CREATE UNLOGGED TABLE allegedly_backfill ( 91 - did text not null, 92 - cid text not null, 93 - operation jsonb not null, 94 - nullified boolean not null, 95 - "createdAt" timestamptz not null 96 - )"#, 97 - &[], 98 - ) 99 - .await?; 100 - 129 + let t_step = Instant::now(); 130 + log::trace!("starting binary COPY IN..."); 101 131 let types = &[ 102 - Type::TEXT, 103 132 Type::TEXT, 104 133 Type::JSONB, 134 + Type::TEXT, 105 135 Type::BOOL, 106 136 Type::TIMESTAMPTZ, 107 137 ]; 108 - let t0 = Instant::now(); 109 - 110 138 let sync = tx 111 - .copy_in("COPY allegedly_backfill FROM STDIN BINARY") 139 + .copy_in( 140 + r#"COPY operations (did, operation, cid, nullified, "createdAt") FROM STDIN BINARY"#, 141 + ) 112 142 .await?; 113 143 let mut writer = pin!(BinaryCopyInWriter::new(sync, types)); 114 - 115 144 while let Ok(page) = pages.recv_async().await { 116 145 for s in page.ops { 117 146 let Ok(op) = serde_json::from_str::<Op>(&s) else { ··· 122 151 .as_mut() 123 152 .write(&[ 124 153 &op.did, 125 - &op.cid, 126 154 &Json(op.operation), 155 + &op.cid, 127 156 &op.nullified, 128 157 &op.created_at, 129 158 ]) 130 159 .await?; 131 160 } 132 161 } 133 - 134 162 let n = writer.as_mut().finish().await?; 135 - log::info!("copied in {n} rows"); 163 + log::trace!("COPY IN wrote {n} ops: {:?}", t_step.elapsed()); 136 164 137 - tx.commit().await?; 138 - log::info!("copy in time: {:?}", t0.elapsed()); 165 + // CAUTION: these indexes MUST match up exactly with the kysely ones we dropped 166 + let t_step = Instant::now(); 167 + tx.execute( 168 + r#"CREATE INDEX operations_did_createdat_idx ON operations (did, "createdAt")"#, 169 + &[], 170 + ) 171 + .await?; 172 + tx.execute( 173 + r#"CREATE INDEX "operations_createdAt_index" ON operations ("createdAt")"#, 174 + &[], 175 + ) 176 + .await?; 177 + log::trace!("indexes recreated: {:?}", t_step.elapsed()); 139 178 140 - log::info!("copying dids into plc table..."); 141 - let n = client 179 + let t_step = Instant::now(); 180 + let n = tx 142 181 .execute( 143 - r#" 144 - INSERT INTO dids 145 - SELECT distinct did FROM allegedly_backfill 146 - ON CONFLICT do nothing"#, 182 + r#"INSERT INTO dids SELECT distinct did FROM operations"#, 147 183 &[], 148 184 ) 149 185 .await?; 150 - log::info!("{n} inserted; elapsed: {:?}", t0.elapsed()); 186 + log::trace!("INSERT wrote {n} dids: {:?}", t_step.elapsed()); 151 187 152 - log::info!("copying ops into plc table..."); 153 - let n = client 154 - .execute( 155 - r#" 156 - INSERT INTO operations (did, cid, operation, nullified, "createdAt") 157 - SELECT did, cid, operation, nullified, "createdAt" FROM allegedly_backfill 158 - ON CONFLICT do nothing"#, 159 - &[], 160 - ) 161 - .await?; 162 - log::info!("{n} inserted; elapsed: {:?}", t0.elapsed()); 188 + let t_step = Instant::now(); 189 + tx.execute("ALTER TABLE dids SET LOGGED", &[]).await?; 190 + tx.execute("ALTER TABLE operations SET LOGGED", &[]).await?; 191 + log::trace!("set tables LOGGED: {:?}", t_step.elapsed()); 163 192 164 - log::info!("clean up backfill table..."); 165 - client 166 - .execute(r#"DROP TABLE allegedly_backfill"#, &[]) 167 - .await?; 168 - 193 + tx.commit().await?; 169 194 log::info!("total backfill time: {:?}", t0.elapsed()); 170 195 171 196 Ok(())