···11mod backfill;
2233-pub use backfill::PageForwarder;
33+pub use backfill::week_to_pages;
4455/// One page of PLC export
66///
77-/// should have maximum length of 1000 lines.
88-/// A bulk export consumer should chunk ops into pages of max 1000 ops.
99-///
1010-/// leading and trailing whitespace should be trimmed.
77+/// Not limited, but expected to have up to about 1000 lines
118pub struct ExportPage {
1212- pub ops: String,
99+ pub ops: Vec<String>,
1310}
+19-40
src/main.rs
src/bin/main.rs
···11use clap::Parser;
22use serde::Deserialize;
33-use std::io::Write;
43use std::time::Duration;
54use tokio_postgres::NoTls;
65use url::Url;
7688-use allegedly::{ExportPage, PageForwarder};
77+use allegedly::{ExportPage, week_to_pages};
98109const EXPORT_PAGE_QUEUE_SIZE: usize = 0; // rendezvous for now
1110const UPSTREAM_REQUEST_INTERVAL: Duration = Duration::from_millis(500);
···7675 while week < immutable_week {
7776 log::info!("backfilling week {week_n} ({week})");
7877 let url = upstream.join(&format!("{week}.jsonl.gz")).unwrap();
7979- let mut gzipped_chunks = client
8080- .get(url)
8181- .send()
8282- .await
8383- .unwrap()
8484- .error_for_status()
8585- .unwrap();
8686-8787- let mut sink = PageForwarder::<1000>::new(tx.clone());
8888- let mut decoder = flate2::write::GzDecoder::new(&mut sink);
8989-9090- while let Some(chunk) = gzipped_chunks.chunk().await.unwrap() {
9191- tokio::task::block_in_place(|| {
9292- let mut chunk = chunk;
9393- while !chunk.is_empty() {
9494- let Ok(n) = decoder
9595- .write(&chunk)
9696- .inspect_err(|e| log::warn!("wat: {e}"))
9797- else {
9898- panic!("can't feed bytes to the decoder :/");
9999- };
100100- if n == 0 {
101101- panic!("apparently we can't write");
102102- }
103103- chunk = chunk.split_off(n);
104104- }
105105- });
106106- }
107107- decoder.flush().unwrap();
108108-7878+ week_to_pages(&client, url, tx.clone()).await.unwrap();
10979 week_n += 1;
11080 week += WEEK_IN_SECONDS;
11181 }
···12191 .user_agent(concat!(
12292 "allegedly v",
12393 env!("CARGO_PKG_VERSION"),
124124- " (part of @microcosm.blue; contact @bad-example.com)"
9494+ " (from @microcosm.blue; contact @bad-example.com)"
12595 ))
126126- .timeout(Duration::from_secs(10))
12796 .build()
12897 .unwrap();
12998···165134 after = Some(op.created_at);
166135167136 log::trace!("got some ops until {after:?}, sending them...");
137137+ let ops = ops.split('\n').map(Into::into).collect();
168138 tx.send_async(ExportPage { ops }).await.unwrap();
169139 }
170140}
···192162 .unwrap();
193163194164 while let Ok(page) = rx.recv_async().await {
195195- log::info!("got a page...");
165165+ log::trace!("got a page...");
196166197167 let mut tx = pg_client.transaction().await.unwrap();
198168199169 // TODO: probably figure out postgres COPY IN
200170 // for now just write everything into a transaction
201171202202- log::info!("setting up inserts...");
203203- for op_line in page.ops.lines() {
204204- let Ok(op) = serde_json::from_str::<Op>(op_line)
172172+ log::trace!("setting up inserts...");
173173+ for op_line in page
174174+ .ops
175175+ .into_iter()
176176+ .flat_map(|s| {
177177+ s.replace("}{", "}\n{")
178178+ .split('\n')
179179+ .map(|s| s.trim())
180180+ .map(Into::into)
181181+ .collect::<Vec<String>>()
182182+ })
183183+ .filter(|s| !s.is_empty())
184184+ {
185185+ let Ok(op) = serde_json::from_str::<Op>(&op_line)
205186 .inspect_err(|e| log::error!("failing! at the {op_line}! {e}"))
206187 else {
207188 log::error!("ayeeeee just ignoring this error for now......");
···236217 }
237218238219 tx.commit().await.unwrap();
239239-240240- log::info!("hi from writer! (done page)");
241220 }
242221 Ok(())
243222}