a textual notation to locate fields within atproto records (draft spec) microcosm.tngl.io/RecordPath/
9
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 620 lines 19 kB view raw
1// RecordPath — parser, matcher, enumerator 2// Reference implementation of the RecordPath draft spec 3 4use serde_json::Value; 5use std::collections::HashSet; 6 7// -- Error -- 8 9/// A parse error from a RecordPath string. 10#[derive(Debug, Clone, thiserror::Error)] 11pub enum ParseError { 12 #[error("empty path")] 13 EmptyPath, 14 15 #[error("empty segment (position {position} in '{input}')")] 16 EmptySegment { input: String, position: usize }, 17 18 #[error("escape at end of input (position {position} in '{input}')")] 19 EscapeAtEnd { input: String, position: usize }, 20 21 #[error("escape followed by non-escapable '{ch}' (position {position} in '{input}')")] 22 InvalidEscape { 23 input: String, 24 position: usize, 25 ch: char, 26 }, 27 28 #[error("unexpected '{ch}' without opening bracket (position {position} in '{input}')")] 29 UnexpectedClose { 30 input: String, 31 position: usize, 32 ch: char, 33 }, 34 35 #[error("unclosed '{open}' (position {position} in '{input}')")] 36 Unclosed { 37 input: String, 38 position: usize, 39 open: char, 40 }, 41 42 #[error("trailing dot (position {position} in '{input}')")] 43 TrailingDot { input: String, position: usize }, 44} 45 46impl ParseError { 47 pub fn input(&self) -> &str { 48 match self { 49 Self::EmptyPath => "", 50 Self::EmptySegment { input, .. } 51 | Self::EscapeAtEnd { input, .. } 52 | Self::InvalidEscape { input, .. } 53 | Self::UnexpectedClose { input, .. } 54 | Self::Unclosed { input, .. } 55 | Self::TrailingDot { input, .. } => input, 56 } 57 } 58 59 pub fn position(&self) -> usize { 60 match self { 61 Self::EmptyPath => 0, 62 Self::EmptySegment { position, .. } 63 | Self::EscapeAtEnd { position, .. } 64 | Self::InvalidEscape { position, .. } 65 | Self::UnexpectedClose { position, .. } 66 | Self::Unclosed { position, .. } 67 | Self::TrailingDot { position, .. } => *position, 68 } 69 } 70 71 /// A corrected version of the input, if one can be inferred. 72 pub fn suggestion(&self) -> Option<String> { 73 let input = self.input(); 74 let pos = self.position(); 75 match self { 76 Self::EmptyPath | Self::EmptySegment { .. } => None, 77 Self::EscapeAtEnd { .. } => Some(format!("{}!!", &input[..pos])), 78 Self::InvalidEscape { .. } => { 79 Some(format!("{}!!{}", &input[..pos], &input[pos + 1..])) 80 } 81 Self::UnexpectedClose { ch, .. } => { 82 Some(format!("{}!{ch}{}", &input[..pos], &input[pos + 1..])) 83 } 84 Self::Unclosed { open, .. } => { 85 let close = if *open == '[' { ']' } else { '}' }; 86 Some(format!("{input}{close}")) 87 } 88 Self::TrailingDot { .. } => Some(input[..input.len() - 1].to_string()), 89 } 90 } 91 92 /// Explanation of what the suggestion changes. 93 pub fn suggestion_hint(&self) -> Option<String> { 94 match self { 95 Self::EmptyPath | Self::EmptySegment { .. } => None, 96 Self::EscapeAtEnd { .. } | Self::InvalidEscape { .. } => { 97 Some("escape the '!' as '!!'".into()) 98 } 99 Self::UnexpectedClose { ch, .. } => Some(format!("escape as '!{ch}'")), 100 Self::Unclosed { open, .. } => { 101 let close = if *open == '[' { ']' } else { '}' }; 102 Some(format!("close with '{close}'")) 103 } 104 Self::TrailingDot { .. } => Some("remove the trailing dot".into()), 105 } 106 } 107} 108 109// -- Types -- 110 111#[derive(Debug, Clone, PartialEq, Eq)] 112pub enum Qualifier { 113 Array, 114 ArrayUnion(String), 115 ScalarUnion(String), 116} 117 118#[derive(Debug, Clone, PartialEq, Eq)] 119pub struct Segment { 120 pub key: String, 121 pub qualifiers: Vec<Qualifier>, 122} 123 124#[derive(Debug, Clone, Copy, PartialEq, Eq)] 125pub enum PathType { 126 Scalar, 127 Vector, 128} 129 130#[derive(Debug, Clone, PartialEq, Eq)] 131pub struct PathInfo { 132 pub path: String, 133 pub path_type: PathType, 134} 135 136// -- Escape -- 137 138fn is_structural(ch: char) -> bool { 139 matches!(ch, '.' | '[' | ']' | '{' | '}' | '!') 140} 141 142pub fn escape_field_name(key: &str) -> String { 143 let mut out = String::with_capacity(key.len()); 144 for ch in key.chars() { 145 if is_structural(ch) { 146 out.push('!'); 147 } 148 out.push(ch); 149 } 150 out 151} 152 153// -- Parser -- 154 155pub fn parse(input: &str) -> Result<Vec<Segment>, ParseError> { 156 if input.is_empty() { 157 return Err(ParseError::EmptyPath); 158 } 159 160 let bytes = input.as_bytes(); 161 let len = bytes.len(); 162 let mut segments = Vec::new(); 163 let mut i = 0; 164 165 while i < len { 166 let mut key = String::new(); 167 168 while i < len && !matches!(bytes[i], b'.' | b'[' | b'{') { 169 match bytes[i] { 170 b'!' => { 171 if i + 1 >= len { 172 return Err(ParseError::EscapeAtEnd { 173 input: input.into(), 174 position: i, 175 }); 176 } 177 let next = bytes[i + 1] as char; 178 if !is_structural(next) { 179 return Err(ParseError::InvalidEscape { 180 input: input.into(), 181 position: i, 182 ch: next, 183 }); 184 } 185 key.push(next); 186 i += 2; 187 } 188 b']' | b'}' => { 189 return Err(ParseError::UnexpectedClose { 190 input: input.into(), 191 position: i, 192 ch: bytes[i] as char, 193 }); 194 } 195 _ => { 196 // Consume a full UTF-8 character (structural chars are all 197 // ASCII, so non-ASCII bytes are always key content). 198 let start = i; 199 i += 1; 200 while i < len && (bytes[i] & 0xC0) == 0x80 { 201 i += 1; 202 } 203 key.push_str(&input[start..i]); 204 } 205 } 206 } 207 208 if key.is_empty() { 209 return Err(ParseError::EmptySegment { 210 input: input.into(), 211 position: i, 212 }); 213 } 214 215 let mut qualifiers = Vec::new(); 216 while i < len && matches!(bytes[i], b'[' | b'{') { 217 let open = bytes[i] as char; 218 let close = if open == '[' { b']' } else { b'}' }; 219 let open_pos = i; 220 i += 1; 221 let content_start = i; 222 while i < len && bytes[i] != close { 223 i += 1; 224 } 225 if i >= len { 226 return Err(ParseError::Unclosed { 227 input: input.into(), 228 position: open_pos, 229 open, 230 }); 231 } 232 let content = &input[content_start..i]; 233 i += 1; 234 235 qualifiers.push(match (open, content.is_empty()) { 236 ('[', true) => Qualifier::Array, 237 ('[', false) => Qualifier::ArrayUnion(content.into()), 238 (_, _) => Qualifier::ScalarUnion(content.into()), 239 }); 240 } 241 242 segments.push(Segment { key, qualifiers }); 243 244 if i < len && bytes[i] == b'.' { 245 i += 1; 246 if i >= len { 247 return Err(ParseError::TrailingDot { 248 input: input.into(), 249 position: i - 1, 250 }); 251 } 252 } 253 } 254 255 Ok(segments) 256} 257 258// -- Matcher -- 259 260pub fn match_path(record: &Value, path: &str) -> Vec<Value> { 261 let Ok(segments) = parse(path) else { 262 return vec![]; 263 }; 264 match_segments(record, &segments, 0) 265} 266 267fn match_segments(data: &Value, segments: &[Segment], seg_idx: usize) -> Vec<Value> { 268 if seg_idx >= segments.len() { 269 return vec![data.clone()]; 270 } 271 let seg = &segments[seg_idx]; 272 let Some(obj) = data.as_object() else { 273 return vec![]; 274 }; 275 let Some(value) = obj.get(&seg.key) else { 276 return vec![]; 277 }; 278 apply_qualifiers(value, &seg.qualifiers, 0, segments, seg_idx) 279} 280 281fn apply_qualifiers( 282 value: &Value, 283 qualifiers: &[Qualifier], 284 qual_idx: usize, 285 segments: &[Segment], 286 seg_idx: usize, 287) -> Vec<Value> { 288 let Some(qual) = qualifiers.get(qual_idx) else { 289 // No more qualifiers — advance to next segment or collect 290 return if seg_idx + 1 >= segments.len() { 291 vec![value.clone()] 292 } else { 293 match_segments(value, segments, seg_idx + 1) 294 }; 295 }; 296 297 match qual { 298 Qualifier::ScalarUnion(nsid) => { 299 let type_matches = value 300 .as_object() 301 .and_then(|o| o.get("$type")) 302 .and_then(|t| t.as_str()) 303 .is_some_and(|t| t == nsid); 304 if type_matches { 305 apply_qualifiers(value, qualifiers, qual_idx + 1, segments, seg_idx) 306 } else { 307 vec![] 308 } 309 } 310 Qualifier::Array | Qualifier::ArrayUnion(_) => { 311 let Some(arr) = value.as_array() else { 312 return vec![]; 313 }; 314 arr.iter() 315 .filter(|elem| match qual { 316 Qualifier::ArrayUnion(nsid) => elem 317 .as_object() 318 .and_then(|o| o.get("$type")) 319 .and_then(|t| t.as_str()) 320 .is_some_and(|t| t == nsid), 321 _ => true, 322 }) 323 .flat_map(|elem| { 324 apply_qualifiers(elem, qualifiers, qual_idx + 1, segments, seg_idx) 325 }) 326 .collect() 327 } 328 } 329} 330 331// -- Enumerator -- 332 333const DEFAULT_MAX_DEPTH: usize = 64; 334 335/// Returns a lazy iterator over all `(PathInfo, &Value)` pairs reachable from 336/// a record. Paths are deduplicated; each unique path is yielded once. 337pub fn enumerate(record: &Value) -> Paths<'_> { 338 Paths::new(record, DEFAULT_MAX_DEPTH) 339} 340 341/// Work items for the stack-based tree walk. 342enum Work<'a> { 343 /// Yield this path+value if not yet seen. 344 Emit { 345 path: String, 346 path_type: PathType, 347 value: &'a Value, 348 }, 349 /// Expand an object's entries onto the stack. 350 Object { 351 obj: &'a serde_json::Map<String, Value>, 352 prefix: String, 353 is_vector: bool, 354 depth: usize, 355 }, 356 /// Expand an array's elements onto the stack. 357 Array { 358 arr: &'a [Value], 359 arr_value: &'a Value, 360 prefix: String, 361 depth: usize, 362 }, 363} 364 365pub struct Paths<'a> { 366 stack: Vec<Work<'a>>, 367 seen: HashSet<String>, 368 max_depth: usize, 369} 370 371impl<'a> Paths<'a> { 372 fn new(record: &'a Value, max_depth: usize) -> Self { 373 let mut paths = Self { 374 stack: Vec::new(), 375 seen: HashSet::new(), 376 max_depth, 377 }; 378 if let Some(obj) = record.as_object() { 379 paths.stack.push(Work::Object { 380 obj, 381 prefix: String::new(), 382 is_vector: false, 383 depth: 0, 384 }); 385 } 386 paths 387 } 388 389 pub fn with_max_depth(mut self, max_depth: usize) -> Self { 390 self.max_depth = max_depth; 391 self 392 } 393 394 fn expand_object( 395 &mut self, 396 obj: &'a serde_json::Map<String, Value>, 397 prefix: &str, 398 is_vector: bool, 399 depth: usize, 400 ) { 401 let vtype = if is_vector { 402 PathType::Vector 403 } else { 404 PathType::Scalar 405 }; 406 407 // Push in reverse so the first key is at the top of the stack. 408 let entries: Vec<_> = obj.iter().collect(); 409 for (key, child) in entries.into_iter().rev() { 410 let escaped = escape_field_name(key); 411 let key_path = if prefix.is_empty() { 412 escaped 413 } else { 414 format!("{prefix}.{escaped}") 415 }; 416 417 // Push children first (deeper in stack), then the emit (top). 418 match child { 419 Value::Array(arr) => { 420 self.stack.push(Work::Array { 421 arr, 422 arr_value: child, 423 prefix: key_path.clone(), 424 depth: depth + 1, 425 }); 426 self.stack.push(Work::Emit { 427 path: key_path, 428 path_type: vtype, 429 value: child, 430 }); 431 } 432 Value::Object(child_obj) => { 433 match child_obj.get("$type").and_then(|t| t.as_str()) { 434 Some(nsid) => { 435 let qualified = format!("{key_path}{{{nsid}}}"); 436 self.stack.push(Work::Object { 437 obj: child_obj, 438 prefix: qualified.clone(), 439 is_vector, 440 depth: depth + 1, 441 }); 442 self.stack.push(Work::Emit { 443 path: qualified, 444 path_type: vtype, 445 value: child, 446 }); 447 self.stack.push(Work::Emit { 448 path: key_path, 449 path_type: vtype, 450 value: child, 451 }); 452 } 453 None => { 454 self.stack.push(Work::Object { 455 obj: child_obj, 456 prefix: key_path.clone(), 457 is_vector, 458 depth: depth + 1, 459 }); 460 self.stack.push(Work::Emit { 461 path: key_path, 462 path_type: vtype, 463 value: child, 464 }); 465 } 466 } 467 } 468 _ => { 469 self.stack.push(Work::Emit { 470 path: key_path, 471 path_type: vtype, 472 value: child, 473 }); 474 } 475 } 476 } 477 } 478 479 fn expand_array( 480 &mut self, 481 arr: &'a [Value], 482 arr_value: &'a Value, 483 prefix: &str, 484 depth: usize, 485 ) { 486 let has_union = arr 487 .iter() 488 .any(|el| el.as_object().is_some_and(|o| o.contains_key("$type"))); 489 490 if has_union { 491 let mut has_plain = false; 492 493 for el in arr.iter().rev() { 494 match el 495 .as_object() 496 .and_then(|o| o.get("$type")) 497 .and_then(|t| t.as_str()) 498 { 499 Some(nsid) => { 500 let qp = format!("{prefix}[{nsid}]"); 501 if let Some(obj) = el.as_object() { 502 self.stack.push(Work::Object { 503 obj, 504 prefix: qp.clone(), 505 is_vector: true, 506 depth: depth + 1, 507 }); 508 } 509 self.stack.push(Work::Emit { 510 path: qp, 511 path_type: PathType::Vector, 512 value: el, 513 }); 514 } 515 None => { 516 has_plain = true; 517 self.expand_child_value(el, &format!("{prefix}[]"), depth + 1); 518 } 519 } 520 } 521 522 if has_plain { 523 self.stack.push(Work::Emit { 524 path: format!("{prefix}[]"), 525 path_type: PathType::Vector, 526 value: arr_value, 527 }); 528 } 529 } else { 530 let bare = format!("{prefix}[]"); 531 532 for el in arr.iter().rev() { 533 self.expand_child_value(el, &bare, depth + 1); 534 } 535 536 self.stack.push(Work::Emit { 537 path: bare, 538 path_type: PathType::Vector, 539 value: arr_value, 540 }); 541 } 542 } 543 544 fn expand_child_value(&mut self, value: &'a Value, prefix: &str, depth: usize) { 545 match value { 546 Value::Object(obj) => { 547 self.stack.push(Work::Object { 548 obj, 549 prefix: prefix.to_string(), 550 is_vector: true, 551 depth, 552 }); 553 } 554 Value::Array(arr) => { 555 self.stack.push(Work::Array { 556 arr, 557 arr_value: value, 558 prefix: prefix.to_string(), 559 depth, 560 }); 561 } 562 _ => {} 563 } 564 } 565} 566 567impl<'a> Iterator for Paths<'a> { 568 type Item = (PathInfo, &'a Value); 569 570 fn next(&mut self) -> Option<Self::Item> { 571 loop { 572 match self.stack.pop()? { 573 Work::Emit { 574 path, 575 path_type, 576 value, 577 } => { 578 if self.seen.insert(path.clone()) { 579 return Some((PathInfo { path, path_type }, value)); 580 } 581 } 582 Work::Object { 583 obj, 584 prefix, 585 is_vector, 586 depth, 587 } => { 588 if depth <= self.max_depth { 589 self.expand_object(obj, &prefix, is_vector, depth); 590 } 591 } 592 Work::Array { 593 arr, 594 arr_value, 595 prefix, 596 depth, 597 } => { 598 if depth <= self.max_depth { 599 self.expand_array(arr, arr_value, &prefix, depth); 600 } 601 } 602 } 603 } 604 } 605} 606 607// -- is_vector -- 608 609pub fn is_vector(path: &str) -> bool { 610 let bytes = path.as_bytes(); 611 let mut i = 0; 612 while i < bytes.len() { 613 match bytes[i] { 614 b'!' if i + 1 < bytes.len() => i += 2, 615 b'[' => return true, 616 _ => i += 1, 617 } 618 } 619 false 620}