a textual notation to locate fields within atproto records (draft spec) microcosm.tngl.io/RecordPath/
8
fork

Configure Feed

Select the types of activity you want to include in your feed.

generators/iterators for path-enumerate output

phil 488901ed 53c7c712

+333 -151
+3 -1
playground/src/routes/+page.svelte
··· 65 65 66 66 let record = $derived(parseResult.record); 67 67 let jsonError = $derived(parseResult.error); 68 - let paths: PathInfo[] = $derived(record ? enumerate(record) : []); 68 + let paths: PathInfo[] = $derived( 69 + record ? Array.from(enumerate(record), ([p]) => p) : [] 70 + ); 69 71 70 72 let pathResult = $derived.by(() => { 71 73 const trimmed = pathInput.trim();
+80 -56
ref-impl-js/src/index.ts
··· 221 221 } 222 222 223 223 // Enumerate all RecordPaths reachable from a record. 224 - export function enumerate(record: Record<string, unknown>): PathInfo[] { 225 - const paths = new Map<string, PathInfo>(); 226 - enumObject(record, '', false, paths); 227 - return Array.from(paths.values()); 224 + // Returns a generator yielding [PathInfo, value] pairs, deduplicated by path. 225 + export function* enumerate( 226 + record: Record<string, unknown> 227 + ): Generator<[PathInfo, unknown]> { 228 + const seen = new Set<string>(); 229 + yield* enumObject(seen, record, '', false); 228 230 } 229 231 230 - function enumObject( 232 + function* enumObject( 233 + seen: Set<string>, 231 234 obj: Record<string, unknown>, 232 235 prefix: string, 233 - isVector: boolean, 234 - paths: Map<string, PathInfo> 235 - ) { 236 + isVector: boolean 237 + ): Generator<[PathInfo, unknown]> { 238 + const vtype = isVector ? 'vector' : 'scalar'; 239 + 236 240 for (const key of Object.keys(obj)) { 237 241 const child = obj[key]; 238 242 const escaped = escapeFieldName(key); 239 243 const keyPath = prefix ? prefix + '.' + escaped : escaped; 240 - const vtype = isVector ? 
'vector' : 'scalar'; 241 244 242 245 if (child === null || child === undefined || typeof child !== 'object') { 243 - paths.set(keyPath, { path: keyPath, type: vtype }); 246 + if (!seen.has(keyPath)) { 247 + seen.add(keyPath); 248 + yield [{ path: keyPath, type: vtype }, child]; 249 + } 244 250 } else if (Array.isArray(child)) { 245 - paths.set(keyPath, { path: keyPath, type: vtype }); 246 - enumArray(child, keyPath, isVector, paths); 251 + if (!seen.has(keyPath)) { 252 + seen.add(keyPath); 253 + yield [{ path: keyPath, type: vtype }, child]; 254 + } 255 + yield* enumArray(seen, child, keyPath); 247 256 } else if ((child as Record<string, unknown>).$type) { 248 - paths.set(keyPath, { path: keyPath, type: vtype }); 257 + if (!seen.has(keyPath)) { 258 + seen.add(keyPath); 259 + yield [{ path: keyPath, type: vtype }, child]; 260 + } 249 261 const nsid = (child as Record<string, unknown>).$type as string; 250 262 const qualified = keyPath + '{' + nsid + '}'; 251 - paths.set(qualified, { path: qualified, type: vtype }); 252 - enumObject(child as Record<string, unknown>, qualified, isVector, paths); 263 + if (!seen.has(qualified)) { 264 + seen.add(qualified); 265 + yield [{ path: qualified, type: vtype }, child]; 266 + } 267 + yield* enumObject(seen, child as Record<string, unknown>, qualified, isVector); 253 268 } else { 254 - paths.set(keyPath, { path: keyPath, type: vtype }); 255 - enumObject(child as Record<string, unknown>, keyPath, isVector, paths); 269 + if (!seen.has(keyPath)) { 270 + seen.add(keyPath); 271 + yield [{ path: keyPath, type: vtype }, child]; 272 + } 273 + yield* enumObject(seen, child as Record<string, unknown>, keyPath, isVector); 256 274 } 257 275 } 258 276 } 259 277 260 - function enumArray( 278 + function* enumArray( 279 + seen: Set<string>, 261 280 arr: unknown[], 262 - prefix: string, 263 - _parentIsVector: boolean, 264 - paths: Map<string, PathInfo> 265 - ) { 281 + prefix: string 282 + ): Generator<[PathInfo, unknown]> { 266 283 const hasUnion 
= arr.some( 267 284 (el) => 268 285 typeof el === 'object' && ··· 272 289 ); 273 290 274 291 if (hasUnion) { 275 - const byType: Record<string, Record<string, unknown>[]> = {}; 276 - const plain: unknown[] = []; 292 + let hasPlain = false; 277 293 for (const el of arr) { 278 - if ( 279 - typeof el === 'object' && 280 - el !== null && 281 - !Array.isArray(el) && 282 - (el as Record<string, unknown>).$type 283 - ) { 284 - const nsid = (el as Record<string, unknown>).$type as string; 285 - (byType[nsid] || (byType[nsid] = [])).push(el as Record<string, unknown>); 294 + const nsid = 295 + typeof el === 'object' && el !== null && !Array.isArray(el) 296 + ? ((el as Record<string, unknown>).$type as string | undefined) 297 + : undefined; 298 + if (nsid) { 299 + const qp = prefix + '[' + nsid + ']'; 300 + if (!seen.has(qp)) { 301 + seen.add(qp); 302 + yield [{ path: qp, type: 'vector' }, el]; 303 + } 304 + if (typeof el === 'object' && el !== null && !Array.isArray(el)) { 305 + yield* enumObject(seen, el as Record<string, unknown>, qp, true); 306 + } 286 307 } else { 287 - plain.push(el); 308 + hasPlain = true; 309 + yield* enumValue(seen, el, prefix + '[]'); 288 310 } 289 311 } 290 - for (const [nsid, elements] of Object.entries(byType)) { 291 - const qp = prefix + '[' + nsid + ']'; 292 - paths.set(qp, { path: qp, type: 'vector' }); 293 - for (const el of elements) { 294 - enumObject(el, qp, true, paths); 312 + if (hasPlain) { 313 + const bare = prefix + '[]'; 314 + if (!seen.has(bare)) { 315 + seen.add(bare); 316 + yield [{ path: bare, type: 'vector' }, arr]; 295 317 } 296 318 } 297 - if (plain.length > 0) { 298 - enumPlainArray(plain, prefix + '[]', paths); 299 - } 300 319 } else { 301 - enumPlainArray(arr, prefix + '[]', paths); 320 + const bare = prefix + '[]'; 321 + if (!seen.has(bare)) { 322 + seen.add(bare); 323 + yield [{ path: bare, type: 'vector' }, arr]; 324 + } 325 + for (const el of arr) { 326 + yield* enumValue(seen, el, bare); 327 + } 302 328 } 303 329 } 
304 330 305 - function enumPlainArray(arr: unknown[], prefix: string, paths: Map<string, PathInfo>) { 306 - paths.set(prefix, { path: prefix, type: 'vector' }); 307 - for (const el of arr) { 308 - if (el === null || el === undefined || typeof el !== 'object') { 309 - // scalar elements — path is the array prefix itself 310 - } else if (Array.isArray(el)) { 311 - enumArray(el, prefix, true, paths); 312 - } else { 313 - enumObject(el as Record<string, unknown>, prefix, true, paths); 314 - } 331 + function* enumValue( 332 + seen: Set<string>, 333 + value: unknown, 334 + prefix: string 335 + ): Generator<[PathInfo, unknown]> { 336 + if (value === null || value === undefined || typeof value !== 'object') { 337 + return; 338 + } 339 + if (Array.isArray(value)) { 340 + yield* enumArray(seen, value, prefix); 341 + } else { 342 + yield* enumObject(seen, value as Record<string, unknown>, prefix, true); 315 343 } 316 344 } 317 - 318 - // TODO: enumerateMatching? pass a test fn that accepts (path, value) pairs and returns bool 319 - // could just filter the output of enumerate, but this avoids collecting lots of stuff we don't need 320 - // (or enumerate could be a generator?? that might be even nicer) 321 345 322 346 export function isVector(pathStr: string): boolean { 323 347 for (let i = 0; i < pathStr.length; i++) {
+3 -3
ref-impl-js/test/interop.test.ts
··· 31 31 describe('enumerate', () => { 32 32 for (const t of enumFixture.tests) { 33 33 it(t.description, () => { 34 - const result = enumerate(t.record); 35 - 36 34 // Compare as sets of {path, type} 37 - const resultSet = new Set(result.map((p: PathInfo) => `${p.path}:${p.type}`)); 35 + const resultSet = new Set( 36 + Array.from(enumerate(t.record), ([p]) => `${p.path}:${p.type}`) 37 + ); 38 38 const expectedSet = new Set( 39 39 t.expected.map((p: { path: string; type: string }) => `${p.path}:${p.type}`) 40 40 );
+238 -86
ref-impl-rust/src/lib.rs
··· 330 330 331 331 // -- Enumerator -- 332 332 333 - struct PathCollector { 333 + const DEFAULT_MAX_DEPTH: usize = 64; 334 + 335 + /// Returns a lazy iterator over all `(PathInfo, &Value)` pairs reachable from 336 + /// a record. Paths are deduplicated; each unique path is yielded once. 337 + pub fn enumerate(record: &Value) -> Paths<'_> { 338 + Paths::new(record, DEFAULT_MAX_DEPTH) 339 + } 340 + 341 + /// Work items for the stack-based tree walk. 342 + enum Work<'a> { 343 + /// Yield this path+value if not yet seen. 344 + Emit { 345 + path: String, 346 + path_type: PathType, 347 + value: &'a Value, 348 + }, 349 + /// Expand an object's entries onto the stack. 350 + Object { 351 + obj: &'a serde_json::Map<String, Value>, 352 + prefix: String, 353 + is_vector: bool, 354 + depth: usize, 355 + }, 356 + /// Expand an array's elements onto the stack. 357 + Array { 358 + arr: &'a [Value], 359 + arr_value: &'a Value, 360 + prefix: String, 361 + depth: usize, 362 + }, 363 + } 364 + 365 + pub struct Paths<'a> { 366 + stack: Vec<Work<'a>>, 334 367 seen: HashSet<String>, 335 - paths: Vec<PathInfo>, 368 + max_depth: usize, 336 369 } 337 370 338 - impl PathCollector { 339 - fn new() -> Self { 340 - Self { 371 + impl<'a> Paths<'a> { 372 + fn new(record: &'a Value, max_depth: usize) -> Self { 373 + let mut paths = Self { 374 + stack: Vec::new(), 341 375 seen: HashSet::new(), 342 - paths: Vec::new(), 343 - } 344 - } 345 - 346 - fn insert(&mut self, path: &str, path_type: PathType) { 347 - if self.seen.insert(path.to_string()) { 348 - self.paths.push(PathInfo { 349 - path: path.to_string(), 350 - path_type, 376 + max_depth, 377 + }; 378 + if let Some(obj) = record.as_object() { 379 + paths.stack.push(Work::Object { 380 + obj, 381 + prefix: String::new(), 382 + is_vector: false, 383 + depth: 0, 351 384 }); 352 385 } 386 + paths 353 387 } 354 - } 355 388 356 - pub fn enumerate(record: &Value) -> Vec<PathInfo> { 357 - let mut collector = PathCollector::new(); 358 - if let Some(obj) = 
record.as_object() { 359 - enum_object(obj, "", false, &mut collector); 389 + pub fn with_max_depth(mut self, max_depth: usize) -> Self { 390 + self.max_depth = max_depth; 391 + self 360 392 } 361 - collector.paths 362 - } 363 393 364 - fn enum_object( 365 - obj: &serde_json::Map<String, Value>, 366 - prefix: &str, 367 - is_vector: bool, 368 - out: &mut PathCollector, 369 - ) { 370 - let vtype = if is_vector { 371 - PathType::Vector 372 - } else { 373 - PathType::Scalar 374 - }; 375 - 376 - for (key, child) in obj { 377 - let escaped = escape_field_name(key); 378 - let key_path = if prefix.is_empty() { 379 - escaped 394 + fn expand_object( 395 + &mut self, 396 + obj: &'a serde_json::Map<String, Value>, 397 + prefix: &str, 398 + is_vector: bool, 399 + depth: usize, 400 + ) { 401 + let vtype = if is_vector { 402 + PathType::Vector 380 403 } else { 381 - format!("{prefix}.{escaped}") 404 + PathType::Scalar 382 405 }; 383 406 384 - out.insert(&key_path, vtype); 407 + // Push in reverse so the first key is at the top of the stack. 408 + let entries: Vec<_> = obj.iter().collect(); 409 + for (key, child) in entries.into_iter().rev() { 410 + let escaped = escape_field_name(key); 411 + let key_path = if prefix.is_empty() { 412 + escaped 413 + } else { 414 + format!("{prefix}.{escaped}") 415 + }; 385 416 386 - match child { 387 - Value::Array(arr) => enum_array(arr, &key_path, is_vector, out), 388 - Value::Object(child_obj) => { 389 - match child_obj.get("$type").and_then(|t| t.as_str()) { 390 - Some(nsid) => { 391 - let qualified = format!("{key_path}{{{nsid}}}"); 392 - out.insert(&qualified, vtype); 393 - enum_object(child_obj, &qualified, is_vector, out); 417 + // Push children first (deeper in stack), then the emit (top). 
418 + match child { 419 + Value::Array(arr) => { 420 + self.stack.push(Work::Array { 421 + arr, 422 + arr_value: child, 423 + prefix: key_path.clone(), 424 + depth: depth + 1, 425 + }); 426 + self.stack.push(Work::Emit { 427 + path: key_path, 428 + path_type: vtype, 429 + value: child, 430 + }); 431 + } 432 + Value::Object(child_obj) => { 433 + match child_obj.get("$type").and_then(|t| t.as_str()) { 434 + Some(nsid) => { 435 + let qualified = format!("{key_path}{{{nsid}}}"); 436 + self.stack.push(Work::Object { 437 + obj: child_obj, 438 + prefix: qualified.clone(), 439 + is_vector, 440 + depth: depth + 1, 441 + }); 442 + self.stack.push(Work::Emit { 443 + path: qualified, 444 + path_type: vtype, 445 + value: child, 446 + }); 447 + self.stack.push(Work::Emit { 448 + path: key_path, 449 + path_type: vtype, 450 + value: child, 451 + }); 452 + } 453 + None => { 454 + self.stack.push(Work::Object { 455 + obj: child_obj, 456 + prefix: key_path.clone(), 457 + is_vector, 458 + depth: depth + 1, 459 + }); 460 + self.stack.push(Work::Emit { 461 + path: key_path, 462 + path_type: vtype, 463 + value: child, 464 + }); 465 + } 394 466 } 395 - None => enum_object(child_obj, &key_path, is_vector, out), 467 + } 468 + _ => { 469 + self.stack.push(Work::Emit { 470 + path: key_path, 471 + path_type: vtype, 472 + value: child, 473 + }); 396 474 } 397 475 } 398 - _ => {} // scalars already inserted 399 476 } 400 477 } 401 - } 402 478 403 - fn enum_array(arr: &[Value], prefix: &str, _is_vector: bool, out: &mut PathCollector) { 404 - let has_union = arr 405 - .iter() 406 - .any(|el| el.as_object().is_some_and(|o| o.contains_key("$type"))); 479 + fn expand_array( 480 + &mut self, 481 + arr: &'a [Value], 482 + arr_value: &'a Value, 483 + prefix: &str, 484 + depth: usize, 485 + ) { 486 + let has_union = arr 487 + .iter() 488 + .any(|el| el.as_object().is_some_and(|o| o.contains_key("$type"))); 407 489 408 - if has_union { 409 - // Partition into typed (union) and plain elements. 
410 - // Use a Vec of pairs to preserve encounter order across types. 411 - let mut seen_types = HashSet::new(); 412 - let mut has_plain = false; 490 + if has_union { 491 + let mut has_plain = false; 413 492 414 - for el in arr { 415 - match el.as_object().and_then(|o| o.get("$type")).and_then(|t| t.as_str()) { 416 - Some(nsid) => { 417 - let qp = format!("{prefix}[{nsid}]"); 418 - if seen_types.insert(nsid.to_string()) { 419 - out.insert(&qp, PathType::Vector); 493 + for el in arr.iter().rev() { 494 + match el 495 + .as_object() 496 + .and_then(|o| o.get("$type")) 497 + .and_then(|t| t.as_str()) 498 + { 499 + Some(nsid) => { 500 + let qp = format!("{prefix}[{nsid}]"); 501 + if let Some(obj) = el.as_object() { 502 + self.stack.push(Work::Object { 503 + obj, 504 + prefix: qp.clone(), 505 + is_vector: true, 506 + depth: depth + 1, 507 + }); 508 + } 509 + self.stack.push(Work::Emit { 510 + path: qp, 511 + path_type: PathType::Vector, 512 + value: el, 513 + }); 420 514 } 421 - if let Some(obj) = el.as_object() { 422 - enum_object(obj, &qp, true, out); 515 + None => { 516 + has_plain = true; 517 + self.expand_child_value(el, &format!("{prefix}[]"), depth + 1); 423 518 } 424 519 } 425 - None => has_plain = true, 426 520 } 427 - } 428 - if has_plain { 521 + 522 + if has_plain { 523 + self.stack.push(Work::Emit { 524 + path: format!("{prefix}[]"), 525 + path_type: PathType::Vector, 526 + value: arr_value, 527 + }); 528 + } 529 + } else { 429 530 let bare = format!("{prefix}[]"); 430 - out.insert(&bare, PathType::Vector); 431 - for el in arr.iter().filter(|el| { 432 - !el.as_object() 433 - .is_some_and(|o| o.contains_key("$type")) 434 - }) { 435 - enum_value(el, &bare, out); 531 + 532 + for el in arr.iter().rev() { 533 + self.expand_child_value(el, &bare, depth + 1); 436 534 } 535 + 536 + self.stack.push(Work::Emit { 537 + path: bare, 538 + path_type: PathType::Vector, 539 + value: arr_value, 540 + }); 437 541 } 438 - } else { 439 - let bare = format!("{prefix}[]"); 440 - 
out.insert(&bare, PathType::Vector); 441 - for el in arr { 442 - enum_value(el, &bare, out); 542 + } 543 + 544 + fn expand_child_value(&mut self, value: &'a Value, prefix: &str, depth: usize) { 545 + match value { 546 + Value::Object(obj) => { 547 + self.stack.push(Work::Object { 548 + obj, 549 + prefix: prefix.to_string(), 550 + is_vector: true, 551 + depth, 552 + }); 553 + } 554 + Value::Array(arr) => { 555 + self.stack.push(Work::Array { 556 + arr, 557 + arr_value: value, 558 + prefix: prefix.to_string(), 559 + depth, 560 + }); 561 + } 562 + _ => {} 443 563 } 444 564 } 445 565 } 446 566 447 - fn enum_value(value: &Value, prefix: &str, out: &mut PathCollector) { 448 - match value { 449 - Value::Array(inner) => enum_array(inner, prefix, true, out), 450 - Value::Object(obj) => enum_object(obj, prefix, true, out), 451 - _ => {} 567 + impl<'a> Iterator for Paths<'a> { 568 + type Item = (PathInfo, &'a Value); 569 + 570 + fn next(&mut self) -> Option<Self::Item> { 571 + loop { 572 + match self.stack.pop()? { 573 + Work::Emit { 574 + path, 575 + path_type, 576 + value, 577 + } => { 578 + if self.seen.insert(path.clone()) { 579 + return Some((PathInfo { path, path_type }, value)); 580 + } 581 + } 582 + Work::Object { 583 + obj, 584 + prefix, 585 + is_vector, 586 + depth, 587 + } => { 588 + if depth <= self.max_depth { 589 + self.expand_object(obj, &prefix, is_vector, depth); 590 + } 591 + } 592 + Work::Array { 593 + arr, 594 + arr_value, 595 + prefix, 596 + depth, 597 + } => { 598 + if depth <= self.max_depth { 599 + self.expand_array(arr, arr_value, &prefix, depth); 600 + } 601 + } 602 + } 603 + } 452 604 } 453 605 } 454 606
+2 -4
ref-impl-rust/tests/interop.rs
··· 55 55 fn enumerate_tests() { 56 56 let f: EnumFixture = serde_json::from_str(ENUMERATE_JSON).unwrap(); 57 57 for t in &f.tests { 58 - let result = enumerate(&t.record); 59 - let result_set: HashSet<String> = result 60 - .iter() 61 - .map(|p| { 58 + let result_set: HashSet<String> = enumerate(&t.record) 59 + .map(|(p, _value)| { 62 60 let ty = match p.path_type { 63 61 PathType::Scalar => "scalar", 64 62 PathType::Vector => "vector",
+7 -1
spec.md
··· 223 223 Lexicon evolution rules forbid **type changes** across schema revisions, so a plain object ref cannot be converted to a union in a lexicon-forward-compatibility-compliant revision under the same NSID; RecordPath canonicalization is therefore hopefully safe from forward-compatible lexicon changes. 224 224 225 225 226 - todo: 226 + #### todo 227 227 228 228 - api recommendations for scalar vs vector queries 229 229 ··· 235 235 - backlinks example: match links on non-link fields if they happen to parse 236 236 237 237 - bring back the expected order of matches: depth-first-search order (probably as a "should") 238 + 239 + - include cbor/drisl -- keep json for examples, but all this should be applicable 240 + 241 + #### questions 242 + 243 + - should an empty RecordPath be legal? would match the entire record.