Server tools to backfill, tail, mirror, and verify PLC logs
49
fork

Configure Feed

Select the types of activity you want to include in your feed.

clean up backfill script

phil 78557d62 5caf8d72

+134 -72
+108 -61
src/bin/backfill.rs
··· 1 1 use allegedly::{ 2 - Db, Dt, FolderSource, HttpSource, backfill, backfill_to_pg, bin::GlobalArgs, bin_init, 3 - full_pages, pages_to_pg, pages_to_stdout, poll_upstream, 2 + Db, Dt, ExportPage, FolderSource, HttpSource, backfill, backfill_to_pg, bin::GlobalArgs, 3 + bin_init, full_pages, pages_to_pg, pages_to_stdout, poll_upstream, 4 4 }; 5 5 use clap::Parser; 6 6 use reqwest::Url; 7 7 use std::path::PathBuf; 8 - use tokio::sync::{mpsc, oneshot}; 8 + use tokio::{ 9 + sync::{mpsc, oneshot}, 10 + task::JoinSet, 11 + }; 12 + 13 + pub const DEFAULT_HTTP: &str = "https://plc.t3.storage.dev/plc.directory/"; 9 14 10 15 #[derive(Debug, clap::Args)] 11 16 pub struct Args { 12 17 /// Remote URL prefix to fetch bundles from 13 18 #[arg(long)] 14 - #[clap(default_value = "https://plc.t3.storage.dev/plc.directory/")] 19 + #[clap(default_value = DEFAULT_HTTP)] 15 20 http: Url, 16 21 /// Local folder to fetch bundles from (overrides `http`) 17 22 #[arg(long)] 18 23 dir: Option<PathBuf>, 24 + /// Don't do weekly bulk-loading at all. 25 + /// 26 + /// overrides `http` and `dir`, makes catch_up redundant 27 + #[arg(long, action)] 28 + no_bulk: bool, 19 29 /// Parallel bundle fetchers 20 30 /// 21 31 /// Default: 4 for http fetches, 1 for local folder ··· 47 57 Args { 48 58 http, 49 59 dir, 60 + no_bulk, 50 61 source_workers, 51 62 to_postgres, 52 63 postgres_cert, ··· 55 66 catch_up, 56 67 }: Args, 57 68 ) -> anyhow::Result<()> { 58 - let (tx, rx) = mpsc::channel(32); // these are big pages 59 - tokio::task::spawn(async move { 60 - if let Some(dir) = dir { 61 - log::info!("Reading weekly bundles from local folder {dir:?}"); 62 - backfill(FolderSource(dir), tx, source_workers.unwrap_or(1), until) 63 - .await 64 - .inspect_err(|e| log::error!("backfill from folder problem: {e}")) 65 - .expect("to source bundles from a folder"); 66 - } else { 67 - log::info!("Fetching weekly bundles from from {http}"); 68 - backfill(HttpSource(http), tx, source_workers.unwrap_or(4), until) 69 - .await 70 - .expect("to source bundles from http"); 71 - } 72 - }); 69 + let mut tasks = JoinSet::new(); 70 + 71 + let (bulk_tx, bulk_out) = mpsc::channel(32); // bulk uses big pages 73 72 74 - // postgres writer will notify us as soon as the very last op's time is known 75 - // so we can start catching up while pg is restoring indexes and stuff 76 - let (notify_last_at, rx_last) = if catch_up { 73 + // a bulk sink can notify us as soon as the very last op's time is known 74 + // so we can start catching up while the sink might restore indexes and such 75 + let (found_last_tx, found_last_out) = if catch_up { 77 76 let (tx, rx) = oneshot::channel(); 78 77 (Some(tx), Some(rx)) 79 78 } else { 80 79 (None, None) 81 80 }; 82 81 83 - let to_postgres_url_bulk = to_postgres.clone(); 84 - let pg_cert = postgres_cert.clone(); 85 - let bulk_out_write = tokio::task::spawn(async move { 86 - if let Some(ref url) = to_postgres_url_bulk { 87 - let db = Db::new(url.as_str(), pg_cert) 88 - .await 89 - .expect("to get db for bulk out write"); 90 - backfill_to_pg(db, postgres_reset, rx, notify_last_at) 91 - .await 92 - .expect("to backfill to pg"); 82 + let (poll_tx, poll_out) = mpsc::channel::<ExportPage>(128); // normal/small pages 83 + let (full_tx, full_out) = mpsc::channel(1); // don't need to buffer at this filter 84 + 85 + // set up sources 86 + if no_bulk { 87 + // simple mode, just poll upstream from teh beginning 88 + if http != DEFAULT_HTTP.parse()? { 89 + log::warn!("ignoring non-default bulk http setting since --no-bulk was set"); 90 + } 91 + if let Some(d) = dir { 92 + log::warn!("ignoring bulk dir setting ({d:?}) since --no-bulk was set."); 93 + } 94 + if let Some(u) = until { 95 + log::warn!( 96 + "ignoring `until` setting ({u:?}) since --no-bulk was set. (feature request?)" 97 + ); 98 + } 99 + let mut upstream = upstream; 100 + upstream.set_path("/export"); 101 + tasks.spawn(poll_upstream(None, upstream, poll_tx)); 102 + tasks.spawn(full_pages(poll_out, full_tx)); 103 + tasks.spawn(pages_to_stdout(full_out, None)); 104 + } else { 105 + // fun mode 106 + 107 + // set up bulk sources 108 + if let Some(dir) = dir { 109 + if http != DEFAULT_HTTP.parse()? { 110 + anyhow::bail!( 111 + "non-default bulk http setting can't be used with bulk dir setting ({dir:?})" 112 + ); 113 + } 114 + tasks.spawn(backfill( 115 + FolderSource(dir), 116 + bulk_tx, 117 + source_workers.unwrap_or(1), 118 + until, 119 + )); 93 120 } else { 94 - pages_to_stdout(rx, notify_last_at) 95 - .await 96 - .expect("to backfill to stdout"); 121 + tasks.spawn(backfill( 122 + HttpSource(http), 123 + bulk_tx, 124 + source_workers.unwrap_or(4), 125 + until, 126 + )); 97 127 } 98 - }); 99 128 100 - if let Some(rx_last) = rx_last { 101 - let mut upstream = upstream; 102 - upstream.set_path("/export"); 103 - // wait until the time for `after` is known 104 - let last_at = rx_last.await.expect("to get the last log's createdAt"); 105 - log::info!("beginning catch-up from {last_at:?} while the writer finalizes stuff"); 106 - let (tx, rx) = mpsc::channel(256); // these are small pages 107 - tokio::task::spawn(async move { 108 - poll_upstream(last_at, upstream, tx) 109 - .await 110 - .expect("polling upstream to work") 111 - }); 112 - bulk_out_write.await.expect("to wait for bulk_out_write"); 113 - log::info!("writing catch-up pages"); 114 - let full_pages = full_pages(rx); 115 - if let Some(url) = to_postgres { 116 - let db = Db::new(url.as_str(), postgres_cert) 117 - .await 118 - .expect("to connect pg for catchup"); 119 - pages_to_pg(db, full_pages) 120 - .await 121 - .expect("to write catch-up pages to pg"); 129 + // and the catch-up source... 130 + if let Some(last) = found_last_out { 131 + tasks.spawn(async move { 132 + let mut upstream = upstream; 133 + upstream.set_path("/export"); 134 + poll_upstream(last.await?, upstream, poll_tx).await 135 + }); 136 + } 137 + 138 + // set up sinks 139 + if let Some(pg_url) = to_postgres { 140 + log::trace!("connecting to postgres..."); 141 + let db = Db::new(pg_url.as_str(), postgres_cert).await?; 142 + log::trace!("connected to postgres"); 143 + 144 + tasks.spawn(backfill_to_pg( 145 + db.clone(), 146 + postgres_reset, 147 + bulk_out, 148 + found_last_tx, 149 + )); 150 + tasks.spawn(pages_to_pg(db, full_out)); 122 151 } else { 123 - pages_to_stdout(full_pages, None) 124 - .await 125 - .expect("to write catch-up pages to stdout"); 152 + tasks.spawn(pages_to_stdout(bulk_out, found_last_tx)); 153 + tasks.spawn(pages_to_stdout(full_out, None)); 126 154 } 127 155 } 156 + 157 + while let Some(next) = tasks.join_next().await { 158 + match next { 159 + Err(e) if e.is_panic() => { 160 + log::error!("a joinset task panicked: {e}. bailing now. (should we panic?)"); 161 + return Err(e.into()); 162 + } 163 + Err(e) => { 164 + log::error!("a joinset task failed to join: {e}"); 165 + return Err(e.into()); 166 + } 167 + Ok(Err(e)) => { 168 + log::error!("a joinset task completed with error: {e}"); 169 + return Err(e); 170 + } 171 + _ => {} 172 + } 173 + } 174 + 128 175 Ok(()) 129 176 } 130 177
+24 -9
src/lib.rs
··· 80 80 /// 81 81 /// PLC will return up to 1000 ops on a page, and returns full pages until it 82 82 /// has caught up, so this is a (hacky?) way to stop polling once we're up. 83 - pub fn full_pages(mut rx: mpsc::Receiver<ExportPage>) -> mpsc::Receiver<ExportPage> { 84 - let (tx, fwd) = mpsc::channel(1); 85 - tokio::task::spawn(async move { 86 - while let Some(page) = rx.recv().await 87 - && page.ops.len() > 900 88 - { 89 - tx.send(page).await.expect("to be able to forward a page"); 83 + pub async fn full_pages( 84 + mut rx: mpsc::Receiver<ExportPage>, 85 + tx: mpsc::Sender<ExportPage>, 86 + ) -> anyhow::Result<()> { 87 + while let Some(page) = rx.recv().await { 88 + let n = page.ops.len(); 89 + if n < 900 { 90 + let last_age = page.ops.last().map(|op| chrono::Utc::now() - op.created_at); 91 + let Some(age) = last_age else { 92 + log::info!("full_pages done, empty final page"); 93 + return Ok(()); 94 + }; 95 + if age <= chrono::TimeDelta::hours(6) { 96 + log::info!("full_pages done, final page of {n} ops"); 97 + } else { 98 + log::warn!("full_pages finished with small page of {n} ops, but it's {age} old"); 99 + } 100 + return Ok(()); 90 101 } 91 - }); 92 - fwd 102 + log::trace!("full_pages: continuing with page of {n} ops"); 103 + tx.send(page).await?; 104 + } 105 + Err(anyhow::anyhow!( 106 + "full_pages ran out of source material, sender closed" 107 + )) 93 108 } 94 109 95 110 pub async fn pages_to_stdout(
+2 -2
src/plc_pg.rs
··· 133 133 } 134 134 } 135 135 136 - pub async fn pages_to_pg(db: Db, mut pages: mpsc::Receiver<ExportPage>) -> Result<(), PgError> { 136 + pub async fn pages_to_pg(db: Db, mut pages: mpsc::Receiver<ExportPage>) -> anyhow::Result<()> { 137 137 let mut client = db.connect().await?; 138 138 139 139 let ops_stmt = client ··· 197 197 reset: bool, 198 198 mut pages: mpsc::Receiver<ExportPage>, 199 199 notify_last_at: Option<oneshot::Sender<Option<Dt>>>, 200 - ) -> Result<(), PgError> { 200 + ) -> anyhow::Result<()> { 201 201 let mut client = db.connect().await?; 202 202 203 203 let t0 = Instant::now();