···66pub mod server;
77pub mod storage;
88pub mod storage_fjall;
99-pub mod storage_mem;
109pub mod store_types;
11101211use crate::error::BatchInsertError;
···287286#[derive(Debug, Serialize, JsonSchema)]
288287pub struct JustCount {
289288 creates: u64,
289289+ updates: u64,
290290+ deletes: u64,
290291 dids_estimate: u64,
291292}
292293
+17-57
ufos/src/main.rs
···77use ufos::server;
88use ufos::storage::{StorageWhatever, StoreBackground, StoreReader, StoreWriter};
99use ufos::storage_fjall::FjallStorage;
1010-use ufos::storage_mem::MemStorage;
1110use ufos::store_types::SketchSecretPrefix;
1211use ufos::{nice_duration, ConsumerInfo};
1312···1918static GLOBAL: Jemalloc = Jemalloc;
20192120/// Aggregate links in the at-mosphere
2222-#[derive(Parser, Debug)]
2121+#[derive(Parser, Debug, Clone)]
2322#[command(version, about, long_about = None)]
2423struct Args {
2524 /// Jetstream server to connect to (exclusive with --fixture). Provide either a wss:// URL, or a shorthand value:
···4746 /// todo: restore this
4847 #[arg(long, action)]
4948 pause_rw: bool,
5050- /// DEBUG: use an in-memory store instead of fjall
5151- #[arg(long, action)]
5252- in_mem: bool,
5349 /// reset the rollup cursor, scrape through missed things in the past (backfill)
5450 #[arg(long, action)]
5551 reroll: bool,
···64606561 let args = Args::parse();
6662 let jetstream = args.jetstream.clone();
6767- if args.in_mem {
6868- let (read_store, write_store, cursor, sketch_secret) = MemStorage::init(
6969- args.data,
7070- jetstream,
7171- args.jetstream_force,
7272- Default::default(),
7373- )?;
7474- go(
7575- args.jetstream,
7676- args.jetstream_fixture,
7777- args.pause_writer,
7878- args.backfill,
7979- args.reroll,
8080- read_store,
8181- write_store,
8282- cursor,
8383- sketch_secret,
8484- )
8585- .await?;
8686- } else {
8787- let (read_store, write_store, cursor, sketch_secret) = FjallStorage::init(
8888- args.data,
8989- jetstream,
9090- args.jetstream_force,
9191- Default::default(),
9292- )?;
9393- go(
9494- args.jetstream,
9595- args.jetstream_fixture,
9696- args.pause_writer,
9797- args.backfill,
9898- args.reroll,
9999- read_store,
100100- write_store,
101101- cursor,
102102- sketch_secret,
103103- )
104104- .await?;
105105- }
106106-6363+ let (read_store, write_store, cursor, sketch_secret) = FjallStorage::init(
6464+ args.data.clone(),
6565+ jetstream,
6666+ args.jetstream_force,
6767+ Default::default(),
6868+ )?;
6969+ go(args, read_store, write_store, cursor, sketch_secret).await?;
10770 Ok(())
10871}
10972110110-#[allow(clippy::too_many_arguments)]
11173async fn go<B: StoreBackground>(
112112- jetstream: String,
113113- jetstream_fixture: bool,
114114- pause_writer: bool,
115115- backfill: bool,
116116- reroll: bool,
7474+ args: Args,
11775 read_store: impl StoreReader + 'static + Clone,
11876 mut write_store: impl StoreWriter<B> + 'static,
11977 cursor: Option<Cursor>,
···12280 println!("starting server with storage...");
12381 let serving = server::serve(read_store.clone());
12482125125- if pause_writer {
8383+ if args.pause_writer {
12684 log::info!("not starting jetstream or the write loop.");
12785 serving.await.map_err(|e| anyhow::anyhow!(e))?;
12886 return Ok(());
12987 }
13088131131- let batches = if jetstream_fixture {
132132- log::info!("starting with jestream file fixture: {jetstream:?}");
133133- file_consumer::consume(jetstream.into(), sketch_secret, cursor).await?
8989+ let batches = if args.jetstream_fixture {
9090+ log::info!("starting with jestream file fixture: {:?}", args.jetstream);
9191+ file_consumer::consume(args.jetstream.into(), sketch_secret, cursor).await?
13492 } else {
13593 log::info!(
13694 "starting consumer with cursor: {cursor:?} from {:?} ago",
13795 cursor.map(|c| c.elapsed())
13896 );
139139- consumer::consume(&jetstream, cursor, false, sketch_secret).await?
9797+ consumer::consume(&args.jetstream, cursor, false, sketch_secret).await?
14098 };
14199142142- let rolling = write_store.background_tasks(reroll)?.run(backfill);
100100+ let rolling = write_store
101101+ .background_tasks(args.reroll)?
102102+ .run(args.backfill);
143103 let storing = write_store.receive_batches(batches);
144104145105 let stating = do_update_stuff(read_store);
+58-47
ufos/src/server.rs
ufos/src/server/mod.rs
···11+mod collections_query;
22+mod cors;
33+14use crate::index_html::INDEX_HTML;
25use crate::storage::StoreReader;
36use crate::store_types::{HourTruncatedCursor, WeekTruncatedCursor};
47use crate::{ConsumerInfo, Cursor, JustCount, Nsid, NsidCount, OrderCollectionsBy, UFOsRecord};
58use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _};
69use chrono::{DateTime, Utc};
1010+use collections_query::MultiCollectionQuery;
1111+use cors::{OkCors, OkCorsResponse};
712use dropshot::endpoint;
813use dropshot::ApiDescription;
914use dropshot::Body;
···1116use dropshot::ConfigLogging;
1217use dropshot::ConfigLoggingLevel;
1318use dropshot::HttpError;
1414-use dropshot::HttpResponseHeaders;
1515-use dropshot::HttpResponseOk;
1619use dropshot::Query;
1720use dropshot::RequestContext;
1821use dropshot::ServerBuilder;
2222+1923use http::{Response, StatusCode};
2024use schemars::JsonSchema;
2125use serde::{Deserialize, Serialize};
···7478}]
7579async fn get_openapi(ctx: RequestContext<Context>) -> OkCorsResponse<serde_json::Value> {
7680 let spec = (*ctx.context().spec).clone();
7777- ok_cors(spec)
8181+ OkCors(spec).into()
7882}
79838084#[derive(Debug, Serialize, JsonSchema)]
···8387 storage: serde_json::Value,
8488 consumer: ConsumerInfo,
8589}
8686-/// Get meta information about UFOs itself
9090+/// UFOs meta-info
8791#[endpoint {
8892 method = GET,
8993 path = "/meta"
···103107 .await
104108 .map_err(failed_to_get("consumer info"))?;
105109106106- ok_cors(MetaInfo {
110110+ OkCors(MetaInfo {
107111 storage_name: storage.name(),
108112 storage: storage_info,
109113 consumer,
110114 })
115115+ .into()
111116}
112117113118// TODO: replace with normal (🙃) multi-qs value somehow
···145150 }
146151 }
147152}
148148-/// Get recent records by collection
153153+/// Record samples
154154+///
155155+/// Get most recent records seen in the firehose, by collection NSID
149156///
150157/// Multiple collections are supported. They will be delivered in one big array with no
151158/// specified order.
···190197 .map(|r| r.into())
191198 .collect();
192199193193- ok_cors(records)
200200+ OkCors(records).into()
194201}
195202196203#[derive(Debug, Deserialize, JsonSchema)]
197197-struct TotalSeenCollectionsQuery {
198198- collection: String, // JsonSchema not implemented for Nsid :(
204204+struct CollectionsStatsQuery {
205205+ /// Limit stats to those seen after this UTC datetime
206206+ ///
207207+ /// default: 1 week ago
208208+ since: Option<DateTime<Utc>>,
209209+ /// Limit stats to those seen before this UTC datetime
210210+ ///
211211+ /// default: now
212212+ until: Option<DateTime<Utc>>,
199213}
200200-#[derive(Debug, Serialize, JsonSchema)]
201201-struct TotalCounts {
202202- total_creates: u64,
203203- dids_estimate: u64,
204204-}
205205-/// Get total records seen by collection
214214+/// Collection stats
215215+///
216216+/// Get record statistics for collections during a specific time period.
217217+///
218218+/// Note: the statistics are "rolled up" into hourly buckets in the background,
219219+/// so the data here can be as stale as that background task is behind. See the
220220+/// meta info endpoint to find out how up-to-date the rollup currently is. (In
221221+/// general it sholud be pretty close to live)
206222#[endpoint {
207223 method = GET,
208208- path = "/records/total-seen"
224224+ path = "/collections/stats"
209225}]
210210-async fn get_records_total_seen(
226226+async fn get_collection_stats(
211227 ctx: RequestContext<Context>,
212212- collection_query: Query<TotalSeenCollectionsQuery>,
213213-) -> OkCorsResponse<HashMap<String, TotalCounts>> {
228228+ collections_query: MultiCollectionQuery,
229229+ query: Query<CollectionsStatsQuery>,
230230+) -> OkCorsResponse<HashMap<String, JustCount>> {
214231 let Context { storage, .. } = ctx.context();
232232+ let q = query.into_inner();
233233+ let collections: HashSet<Nsid> = collections_query.try_into()?;
215234216216- let query = collection_query.into_inner();
217217- let collections = to_multiple_nsids(&query.collection)
218218- .map_err(|reason| HttpError::for_bad_request(None, reason))?;
235235+ let since = q.since.map(dt_to_cursor).transpose()?.unwrap_or_else(|| {
236236+ let week_ago_secs = 7 * 86_400;
237237+ let week_ago = SystemTime::now() - Duration::from_secs(week_ago_secs);
238238+ Cursor::at(week_ago).into()
239239+ });
240240+241241+ let until = q.until.map(dt_to_cursor).transpose()?;
219242220243 let mut seen_by_collection = HashMap::with_capacity(collections.len());
221244222245 for collection in &collections {
223223- let (total_creates, dids_estimate) = storage
224224- .get_counts_by_collection(collection)
246246+ let counts = storage
247247+ .get_collection_counts(collection, since, until)
225248 .await
226249 .map_err(|e| HttpError::for_internal_error(format!("boooo: {e:?}")))?;
227250228228- seen_by_collection.insert(
229229- collection.to_string(),
230230- TotalCounts {
231231- total_creates,
232232- dids_estimate,
233233- },
234234- );
251251+ seen_by_collection.insert(collection.to_string(), counts);
235252 }
236253237237- ok_cors(seen_by_collection)
254254+ OkCors(seen_by_collection).into()
238255}
239256240257#[derive(Debug, Serialize, JsonSchema)]
···283300 order: Option<CollectionsQueryOrder>,
284301}
285302286286-/// Get collection with statistics
303303+/// List collections
304304+///
305305+/// With statistics.
287306///
288307/// ## To fetch a full list:
289308///
···353372354373 let next_cursor = next_cursor.map(|c| URL_SAFE_NO_PAD.encode(c));
355374356356- ok_cors(CollectionsResponse {
375375+ OkCors(CollectionsResponse {
357376 collections,
358377 cursor: next_cursor,
359378 })
379379+ .into()
360380}
361381362382#[derive(Debug, Deserialize, JsonSchema)]
···384404 range: Vec<DateTime<Utc>>,
385405 series: HashMap<String, Vec<JustCount>>,
386406}
387387-/// Get timeseries data
407407+/// Collection timeseries stats
388408#[endpoint {
389409 method = GET,
390410 path = "/timeseries"
···407427 let step = if let Some(secs) = q.step {
408428 if secs < 3600 {
409429 let msg = format!("step is too small: {}", secs);
410410- return Err(HttpError::for_bad_request(None, msg));
430430+ Err(HttpError::for_bad_request(None, msg))?;
411431 }
412432 (secs / 3600) * 3600 // truncate to hour
413433 } else {
···433453 .map(|(k, v)| (k.to_string(), v.iter().map(Into::into).collect()))
434454 .collect();
435455436436- ok_cors(CollectionTimeseriesResponse { range, series })
456456+ OkCors(CollectionTimeseriesResponse { range, series }).into()
437457}
438458439459pub async fn serve(storage: impl StoreReader + 'static) -> Result<(), String> {
···449469 api.register(get_openapi).unwrap();
450470 api.register(get_meta_info).unwrap();
451471 api.register(get_records_by_collections).unwrap();
452452- api.register(get_records_total_seen).unwrap();
472472+ api.register(get_collection_stats).unwrap();
453473 api.register(get_collections).unwrap();
454474 api.register(get_timeseries).unwrap();
455475···482502 .map_err(|error| format!("failed to start server: {}", error))?
483503 .await
484504}
485485-486486-/// awkward helpers
487487-type OkCorsResponse<T> = Result<HttpResponseHeaders<HttpResponseOk<T>>, HttpError>;
488488-fn ok_cors<T: Send + Sync + Serialize + JsonSchema>(t: T) -> OkCorsResponse<T> {
489489- let mut res = HttpResponseHeaders::new_unnamed(HttpResponseOk(t));
490490- res.headers_mut()
491491- .insert("access-control-allow-origin", "*".parse().unwrap());
492492- Ok(res)
493493-}
+72
ufos/src/server/collections_query.rs
···11+use crate::Nsid;
22+use async_trait::async_trait;
33+use dropshot::{
44+ ApiEndpointBodyContentType, ExtractorMetadata, HttpError, Query, RequestContext, ServerContext,
55+ SharedExtractor,
66+};
77+use schemars::JsonSchema;
88+use serde::Deserialize;
99+use std::collections::HashSet;
1010+1111+/// The real type that gets deserialized
1212+#[derive(Debug, Deserialize, JsonSchema)]
1313+pub struct MultiCollectionQuery {
1414+ pub collection: Vec<String>,
1515+}
1616+1717+/// The fake corresponding type for docs that dropshot won't freak out about a
1818+/// vec for
1919+#[derive(Deserialize, JsonSchema)]
2020+#[allow(dead_code)]
2121+struct MultiCollectionQueryForDocs {
2222+ /// One or more collection [NSID](https://atproto.com/specs/nsid)s
2323+ ///
2424+ /// Pass this parameter multiple times to specify multiple collections, like
2525+ /// `collection=app.bsky.feed.like&collection=app.bsky.feed.post`
2626+ collection: String,
2727+}
2828+2929+impl TryFrom<MultiCollectionQuery> for HashSet<Nsid> {
3030+ type Error = HttpError;
3131+ fn try_from(mcq: MultiCollectionQuery) -> Result<Self, Self::Error> {
3232+ let mut out = HashSet::with_capacity(mcq.collection.len());
3333+ for c in mcq.collection {
3434+ let nsid = Nsid::new(c).map_err(|e| {
3535+ HttpError::for_bad_request(
3636+ None,
3737+ format!("failed to convert collection to an NSID: {e:?}"),
3838+ )
3939+ })?;
4040+ out.insert(nsid);
4141+ }
4242+ Ok(out)
4343+ }
4444+}
4545+4646+// The `SharedExtractor` implementation for Query<QueryType> describes how to
4747+// construct an instance of `Query<QueryType>` from an HTTP request: namely, by
4848+// parsing the query string to an instance of `QueryType`.
4949+#[async_trait]
5050+impl SharedExtractor for MultiCollectionQuery {
5151+ async fn from_request<Context: ServerContext>(
5252+ ctx: &RequestContext<Context>,
5353+ ) -> Result<MultiCollectionQuery, HttpError> {
5454+ let raw_query = ctx.request.uri().query().unwrap_or("");
5555+ let q = serde_qs::from_str(raw_query).map_err(|e| {
5656+ HttpError::for_bad_request(None, format!("unable to parse query string: {}", e))
5757+ })?;
5858+ Ok(q)
5959+ }
6060+6161+ fn metadata(body_content_type: ApiEndpointBodyContentType) -> ExtractorMetadata {
6262+ // HACK: query type switcheroo: passing MultiCollectionQuery to
6363+ // `metadata` would "helpfully" panic because dropshot believes we can
6464+ // only have scalar types in a query.
6565+ //
6666+ // so instead we have a fake second type whose only job is to look the
6767+ // same as MultiCollectionQuery exept that it has `String` instead of
6868+ // `Vec<String>`, which dropshot will accept, and generate ~close-enough
6969+ // docs for.
7070+ <Query<MultiCollectionQueryForDocs> as SharedExtractor>::metadata(body_content_type)
7171+ }
7272+}
+23
ufos/src/server/cors.rs
···11+use dropshot::{HttpError, HttpResponseHeaders, HttpResponseOk};
22+use schemars::JsonSchema;
33+use serde::Serialize;
44+55+pub type OkCorsResponse<T> = Result<HttpResponseHeaders<HttpResponseOk<T>>, HttpError>;
/// Helper for constructing Ok responses: return OkCors(T).into()
/// (not happy with this yet)
// Trait bounds are deliberately left off the struct; the `From` conversion
// that needs them states them itself.
pub struct OkCors<T>(pub T);
1010+1111+impl<T> From<OkCors<T>> for OkCorsResponse<T>
1212+where
1313+ T: Serialize + JsonSchema + Send + Sync,
1414+{
1515+ fn from(ok: OkCors<T>) -> OkCorsResponse<T> {
1616+ let mut res = HttpResponseHeaders::new_unnamed(HttpResponseOk(ok.0));
1717+ res.headers_mut()
1818+ .insert("access-control-allow-origin", "*".parse().unwrap());
1919+ Ok(res)
2020+ }
2121+}
2222+2323+// TODO: cors for HttpError