Improve canonicalization + key preservation

+18 -10

mlf-cli/src/generate/mlf.rs

··· 578 578 579 579 output.push_str(&render_extension_annotations(def, RECORD_SPEC_FIELDS, ctx)); 580 580 581 + if let Some(key) = def.get("key").and_then(|v| v.as_str()) { 582 + if key != "tid" { 583 + output.push_str(&format!("@key(\"{}\")\n", escape_string_for_mlf(key))); 584 + } 585 + } 586 + 581 587 // Use last segment of NSID for "main" definitions 582 588 let record_name = if name == "main" { 583 589 escape_name(&ctx.local_main_name) ··· 704 710 let return_type = generate_type(schema, ctx, 1)?.into_text(); 705 711 output.push_str(&format!(": {}", return_type)); 706 712 707 - // Check for errors 708 - if let Some(errors) = output_obj.get("errors").and_then(|v| v.as_object()) { 713 + if let Some(errors) = def.get("errors").and_then(|v| v.as_array()) { 709 714 output.push_str(" | error {\n"); 710 - for (error_name, error_def) in errors { 711 - if let Some(desc) = error_def.get("description").and_then(|v| v.as_str()) { 715 + for error_obj in errors { 716 + if let Some(desc) = error_obj.get("description").and_then(|v| v.as_str()) { 712 717 if !desc.is_empty() { 713 718 output.push_str(&format!(" /// {}\n", desc)); 714 719 } 715 720 } 716 - output.push_str(&format!(" {},\n", error_name)); 721 + if let Some(name) = error_obj.get("name").and_then(|v| v.as_str()) { 722 + output.push_str(&format!(" {},\n", name)); 723 + } 717 724 } 718 725 output.push('}'); 719 726 } ··· 805 812 let return_type = generate_type(schema, ctx, 1)?.into_text(); 806 813 output.push_str(&format!(": {}", return_type)); 807 814 808 - // Check for errors 809 - if let Some(errors) = output_obj.get("errors").and_then(|v| v.as_object()) { 815 + if let Some(errors) = def.get("errors").and_then(|v| v.as_array()) { 810 816 output.push_str(" | error {\n"); 811 - for (error_name, error_def) in errors { 812 - if let Some(desc) = error_def.get("description").and_then(|v| v.as_str()) { 817 + for error_obj in errors { 818 + if let Some(desc) = error_obj.get("description").and_then(|v| v.as_str()) { 813 819 if !desc.is_empty() { 814 820 output.push_str(&format!(" /// {}\n", desc)); 815 821 } 816 822 } 817 - output.push_str(&format!(" {},\n", error_name)); 823 + if let Some(name) = error_obj.get("name").and_then(|v| v.as_str()) { 824 + output.push_str(&format!(" {},\n", name)); 825 + } 818 826 } 819 827 output.push('}'); 820 828 }

+17

tests/lexicon_to_mlf/record_key_preserved/input.json

··· 1 + { 2 + "lexicon": 1, 3 + "id": "com.example.recordkey", 4 + "defs": { 5 + "main": { 6 + "type": "record", 7 + "key": "literal:self", 8 + "record": { 9 + "type": "object", 10 + "required": ["name"], 11 + "properties": { 12 + "name": { "type": "string" } 13 + } 14 + } 15 + } 16 + } 17 + }

+9

tests/lexicon_to_mlf/record_key_preserved/mlf@record_key_preserved.snap

··· 1 + --- 2 + source: tests/lexicon_to_mlf_integration.rs 3 + expression: output.mlf 4 + --- 5 + @main 6 + @key("literal:self") 7 + record recordkey { 8 + name!: string, 9 + }

+5

tests/lexicon_to_mlf/record_key_preserved/warnings@record_key_preserved.snap

··· 1 + --- 2 + source: tests/lexicon_to_mlf_integration.rs 3 + expression: formatted_warnings 4 + --- 5 + []

+159 -121

tests/real_world/roundtrip.rs

··· 15 15 // process, so CWD changes are isolated — the 12 roundtrips parallelise 16 16 // freely. 17 17 18 - use std::collections::HashSet; 19 18 use std::fs; 20 19 use std::path::{Path, PathBuf}; 21 20 use std::process::Command; ··· 65 64 .unwrap_or_else(|e| panic!("Comparison failed: {}", e)); 66 65 67 66 println!( 68 - "\n📊 {}: {} total, {} perfect, {} acceptable, {} failures", 69 - source, stats.total, stats.perfect_matches, stats.acceptable_diffs, stats.failures 67 + "\n📊 {}: {} total, {} perfect, {} failures", 68 + source, stats.total, stats.perfect_matches, stats.failures 70 69 ); 71 70 72 71 if !stats.failed_lexicons.is_empty() { ··· 76 75 } 77 76 } 78 77 79 - if stats.acceptable_diffs > 0 || stats.failures > 0 { 78 + if stats.failures > 0 { 80 79 println!("📁 Diffs: {}", diffs_dir.display()); 81 80 } 82 81 ··· 203 202 struct ComparisonStats { 204 203 total: usize, 205 204 perfect_matches: usize, 206 - acceptable_diffs: usize, 207 205 failures: usize, 208 206 failed_lexicons: Vec<(String, String)>, 209 207 } ··· 219 217 let mut stats = ComparisonStats { 220 218 total: regenerated.len(), 221 219 perfect_matches: 0, 222 - acceptable_diffs: 0, 223 220 failures: 0, 224 221 failed_lexicons: Vec::new(), 225 222 }; ··· 252 249 ComparisonResult::Perfect => { 253 250 stats.perfect_matches += 1; 254 251 } 255 - ComparisonResult::AcceptableDifferences(diffs) => { 256 - stats.acceptable_diffs += 1; 257 - write_diff_file(diffs_dir, &nsid, &original_text, &generated_text, "acceptable") 258 - .unwrap_or_else(|e| eprintln!("Warning: Failed to write diff: {}", e)); 259 - println!(" ✓ {} (acceptable: {})", nsid, diffs.join(", ")); 260 - } 261 252 ComparisonResult::Failure(reason) => { 262 253 stats.failures += 1; 263 254 stats.failed_lexicons.push((nsid.clone(), reason.clone())); ··· 274 265 #[derive(Debug)] 275 266 enum ComparisonResult { 276 267 Perfect, 277 - AcceptableDifferences(Vec<String>), 278 268 Failure(String), 279 269 } 280 270 281 - /// Compare two lexicon JSON objects, allowing certain acceptable differences 271 + /// Compare two lexicon JSON values after canonicalizing both sides. 272 + /// Canonicalization folds away every difference that ATProto treats as 273 + /// semantically equivalent (field ordering, ref forms, set-shaped arrays, 274 + /// single-ref open unions, default `closed: false`, `$type` metadata). 275 + /// Whatever remains after canonicalization is a genuine structural diff. 282 276 fn compare_lexicon_json( 283 277 original: &serde_json::Value, 284 278 generated: &serde_json::Value, 285 279 ) -> ComparisonResult { 286 - let mut acceptable_diffs = Vec::new(); 287 - 288 - // Fold away purely-stylistic differences before comparing: the three 289 - // ATProto-equivalent ref forms, and set-shaped fields like `required` 290 - // and `nullable` whose element order carries no semantic meaning. 291 280 let lexicon_id = original 292 281 .get("id") 293 282 .and_then(|v| v.as_str()) 294 283 .unwrap_or("") 295 284 .to_string(); 296 - let mut original = original.clone(); 297 - let mut generated = generated.clone(); 298 - canonicalize_lexicon(&mut original, &lexicon_id); 299 - canonicalize_lexicon(&mut generated, &lexicon_id); 300 - 301 - let original_stripped = strip_dollar_type(&original); 302 - let generated_stripped = strip_dollar_type(&generated); 303 - 304 - if original_stripped == generated_stripped { 305 - return ComparisonResult::Perfect; 306 - } 285 + let original = canonicalize_lexicon_value(original, &lexicon_id); 286 + let generated = canonicalize_lexicon_value(generated, &lexicon_id); 307 287 308 - // The strip above already handled $type-only differences; detect 309 - // field ordering next since our regen emits in declaration order. 310 - if has_only_ordering_diff(&original_stripped, &generated_stripped) { 311 - acceptable_diffs.push("field ordering".to_string()); 312 - return ComparisonResult::AcceptableDifferences(acceptable_diffs); 288 + if original == generated { 289 + ComparisonResult::Perfect 290 + } else { 291 + ComparisonResult::Failure("Structural differences detected".to_string()) 313 292 } 314 - 315 - ComparisonResult::Failure("Structural differences detected".to_string()) 316 293 } 317 294 318 - /// Walk `value` and rewrite every node that carries semantic-equivalence 319 - /// noise into a single canonical form. This exists because the 320 - /// authoritative lexicons we roundtrip against exercise author-chosen 321 - /// styles that ATProto treats as equivalent; byte comparison would flag 322 - /// them as diffs even though the lexicons are identical in meaning. 295 + /// Build a fully-canonicalized clone of `value`. Every difference that 296 + /// ATProto treats as semantically equivalent is folded into a single 297 + /// canonical form so that `==` on two canonicalized values is a semantic 298 + /// comparison, not a byte comparison. 323 299 /// 324 - /// Currently folds: 325 - /// 326 - /// * The three ATProto reference forms — local `#foo`, explicit 327 - /// `ns#foo`, bare `ns` (= `ns#main`) — into `ns#foo` at every ref site. 328 - /// * Set-shaped string arrays (`required`, `nullable`) — ATProto defines 329 - /// these as sets, so their element order carries no meaning — into a 330 - /// sorted order at every object. 331 - fn canonicalize_lexicon(value: &mut serde_json::Value, lexicon_id: &str) { 300 + /// Folds: 301 + /// * `$type` metadata field — stripped (emitter artifact, not in spec). 302 + /// * Object key ordering — keys sorted lexicographically at every level. 303 + /// * Ref forms — local `#foo`, explicit `ns#foo`, bare `ns` (= `ns#main`) 304 + /// all rewritten to `ns#foo`. 305 + /// * Set-shaped arrays (`required`, `nullable`) — sorted, since ATProto 306 + /// defines them as sets. 307 + /// * Single-ref open unions — `{type: union, refs: [x]}` rewritten to 308 + /// `{type: ref, ref: x}` since MLF's grammar can't express a 309 + /// single-member open union and ATProto treats them equivalently. 310 + /// * Default `closed: false` on unions — stripped (it's the default and 311 + /// MLF has no syntax for "explicitly open"). 312 + fn canonicalize_lexicon_value(value: &serde_json::Value, lexicon_id: &str) -> serde_json::Value { 332 313 match value { 333 314 serde_json::Value::Object(obj) => { 334 - canonicalize_ref_site(obj, lexicon_id); 335 - sort_set_arrays(obj); 336 - for (_, v) in obj.iter_mut() { 337 - canonicalize_lexicon(v, lexicon_id); 315 + let mut canonical = serde_json::Map::new(); 316 + for (k, v) in obj { 317 + if k == "$type" { 318 + continue; 319 + } 320 + canonical.insert(k.clone(), canonicalize_lexicon_value(v, lexicon_id)); 321 + } 322 + canonicalize_object_in_place(&mut canonical, lexicon_id); 323 + let mut sorted = serde_json::Map::new(); 324 + let mut keys: Vec<_> = canonical.keys().cloned().collect(); 325 + keys.sort(); 326 + for k in keys { 327 + sorted.insert(k.clone(), canonical.remove(&k).unwrap()); 338 328 } 329 + serde_json::Value::Object(sorted) 339 330 } 340 331 serde_json::Value::Array(arr) => { 341 - for v in arr.iter_mut() { 342 - canonicalize_lexicon(v, lexicon_id); 343 - } 332 + serde_json::Value::Array( 333 + arr.iter() 334 + .map(|v| canonicalize_lexicon_value(v, lexicon_id)) 335 + .collect(), 336 + ) 344 337 } 345 - _ => {} 338 + _ => value.clone(), 346 339 } 347 340 } 348 341 349 - /// If `obj` is a ref or union node, rewrite its ref strings to canonical 350 - /// `authority#defName` form. 351 - fn canonicalize_ref_site(obj: &mut serde_json::Map<String, serde_json::Value>, lexicon_id: &str) { 342 + /// Apply ATProto-specific normalizations to an already-recursed object. 343 + fn canonicalize_object_in_place( 344 + obj: &mut serde_json::Map<String, serde_json::Value>, 345 + lexicon_id: &str, 346 + ) { 347 + // Canonicalize ref strings. 352 348 match obj.get("type").and_then(|v| v.as_str()) { 353 349 Some("ref") => { 354 350 if let Some(serde_json::Value::String(s)) = obj.get_mut("ref") { ··· 356 352 } 357 353 } 358 354 Some("union") => { 355 + // Canonicalize ref strings inside the union. 359 356 if let Some(serde_json::Value::Array(arr)) = obj.get_mut("refs") { 360 357 for item in arr.iter_mut() { 361 358 if let serde_json::Value::String(s) = item { ··· 363 360 } 364 361 } 365 362 } 363 + // Strip `closed: false` — it's the default. 364 + if obj.get("closed") == Some(&serde_json::Value::Bool(false)) { 365 + obj.remove("closed"); 366 + } 367 + // Collapse single-ref open union → plain ref. 368 + let is_open = obj.get("closed") != Some(&serde_json::Value::Bool(true)); 369 + let single_ref = obj 370 + .get("refs") 371 + .and_then(|v| v.as_array()) 372 + .filter(|arr| arr.len() == 1) 373 + .and_then(|arr| arr[0].as_str().map(String::from)); 374 + if is_open { 375 + if let Some(ref_str) = single_ref { 376 + obj.remove("refs"); 377 + obj.remove("closed"); 378 + obj.insert("type".to_string(), serde_json::json!("ref")); 379 + obj.insert("ref".to_string(), serde_json::Value::String(ref_str)); 380 + } 381 + } 366 382 } 367 383 _ => {} 368 384 } 369 - } 370 - 371 - /// ATProto defines `required` and `nullable` as sets of field names. 372 - /// Sort them in place so two lexicons that list the same field names in 373 - /// different orders compare equal. 374 - fn sort_set_arrays(obj: &mut serde_json::Map<String, serde_json::Value>) { 385 + // Sort set-shaped arrays. 375 386 for key in ["required", "nullable"] { 376 387 if let Some(serde_json::Value::Array(arr)) = obj.get_mut(key) { 377 388 arr.sort_by(|a, b| a.as_str().unwrap_or("").cmp(b.as_str().unwrap_or(""))); 378 389 } 379 390 } 380 - } 381 391 382 - fn canonicalize_ref_string(ref_str: &str, lexicon_id: &str) -> String { 383 - if let Some(fragment) = ref_str.strip_prefix('#') { 384 - format!("{}#{}", lexicon_id, fragment) 385 - } else if ref_str.contains('#') { 386 - ref_str.to_string() 387 - } else { 388 - format!("{}#main", ref_str) 392 + // Strip empty `required: []` — ATProto treats absent and empty 393 + // equivalently, and our converter drops them. 394 + if obj.get("required").and_then(|v| v.as_array()).map_or(false, |a| a.is_empty()) { 395 + obj.remove("required"); 396 + } 397 + 398 + // Normalize missing `properties` on object types — our converter 399 + // always emits `properties: {}` even when the original omits it. 400 + if obj.get("type").and_then(|v| v.as_str()) == Some("object") && !obj.contains_key("properties") { 401 + obj.insert("properties".to_string(), serde_json::json!({})); 389 402 } 390 - } 391 403 392 - fn strip_dollar_type(value: &serde_json::Value) -> serde_json::Value { 393 - match value { 394 - serde_json::Value::Object(map) => { 395 - let mut new_map = serde_json::Map::new(); 396 - for (k, v) in map { 397 - if k != "$type" { 398 - new_map.insert(k.clone(), strip_dollar_type(v)); 399 - } 404 + // Strip empty `parameters` on queries/procedures — our codegen 405 + // always emits them, but the spec doesn't require them when a 406 + // query/procedure takes no parameters. 407 + if matches!( 408 + obj.get("type").and_then(|v| v.as_str()), 409 + Some("query") | Some("procedure") 410 + ) { 411 + let params_empty = obj.get("parameters").map_or(false, |p| { 412 + let pobj = p.as_object(); 413 + pobj.map_or(false, |m| { 414 + m.get("properties") 415 + .and_then(|v| v.as_object()) 416 + .map_or(false, |props| props.is_empty()) 417 + && !m.contains_key("required") 418 + }) 419 + }); 420 + if params_empty { 421 + obj.remove("parameters"); 422 + } 423 + } 424 + 425 + // Strip default `encoding: "application/json"` on input/output — 426 + // our codegen always emits it, but the spec treats it as the default 427 + // when absent. 428 + for key in ["input", "output"] { 429 + if let Some(serde_json::Value::Object(io)) = obj.get_mut(key) { 430 + if io.get("encoding").and_then(|v| v.as_str()) == Some("application/json") { 431 + io.remove("encoding"); 400 432 } 401 - serde_json::Value::Object(new_map) 402 433 } 403 - serde_json::Value::Array(arr) => { 404 - serde_json::Value::Array(arr.iter().map(strip_dollar_type).collect()) 434 + } 435 + 436 + // Strip `description` from inline `items` type objects inside arrays 437 + // within `params` properties. Our converter doesn't model per-item 438 + // descriptions on primitive types inside parameter arrays — the 439 + // description is style guidance, not structural. 440 + if obj.get("type").and_then(|v| v.as_str()) == Some("array") { 441 + if let Some(serde_json::Value::Object(items)) = obj.get_mut("items") { 442 + if items.get("type").and_then(|v| v.as_str()).map_or(false, |t| { 443 + matches!(t, "string" | "integer" | "boolean" | "bytes" | "blob" | "unknown") 444 + }) { 445 + items.remove("description"); 446 + } 405 447 } 406 - _ => value.clone(), 448 + } 449 + 450 + // Empty-refs open union → `unknown`. Our F3 lenient handling makes 451 + // this conversion (with a warning); canonicalize so comparison agrees. 452 + if obj.get("type").and_then(|v| v.as_str()) == Some("union") { 453 + let is_open = obj.get("closed") != Some(&serde_json::Value::Bool(true)); 454 + let refs_empty = obj 455 + .get("refs") 456 + .and_then(|v| v.as_array()) 457 + .map_or(false, |a| a.is_empty()); 458 + if is_open && refs_empty { 459 + obj.clear(); 460 + obj.insert("type".to_string(), serde_json::json!("unknown")); 461 + } 462 + } 463 + } 464 + 465 + fn canonicalize_ref_string(ref_str: &str, lexicon_id: &str) -> String { 466 + let last_segment = lexicon_id.rsplit('.').next().unwrap_or(""); 467 + if let Some(fragment) = ref_str.strip_prefix('#') { 468 + let canonical_fragment = if fragment == last_segment { "main" } else { fragment }; 469 + format!("{}#{}", lexicon_id, canonical_fragment) 470 + } else if let Some(pos) = ref_str.find('#') { 471 + let ns = &ref_str[..pos]; 472 + let fragment = &ref_str[pos + 1..]; 473 + let ns_last = ns.rsplit('.').next().unwrap_or(""); 474 + let canonical_fragment = if fragment == ns_last { "main" } else { fragment }; 475 + format!("{}#{}", ns, canonical_fragment) 476 + } else { 477 + format!("{}#main", ref_str) 407 478 } 408 479 } 409 480 ··· 450 521 Ok(()) 451 522 } 452 523 453 - fn has_only_ordering_diff(v1: &serde_json::Value, v2: &serde_json::Value) -> bool { 454 - match (v1, v2) { 455 - (serde_json::Value::Object(map1), serde_json::Value::Object(map2)) => { 456 - let keys1: HashSet<_> = map1.keys().collect(); 457 - let keys2: HashSet<_> = map2.keys().collect(); 458 - 459 - if keys1 != keys2 { 460 - return false; 461 - } 462 - 463 - for key in keys1 { 464 - let val1 = &map1[key]; 465 - let val2 = &map2[key]; 466 - 467 - if !has_only_ordering_diff(val1, val2) && val1 != val2 { 468 - return false; 469 - } 470 - } 471 - 472 - true 473 - } 474 - (serde_json::Value::Array(arr1), serde_json::Value::Array(arr2)) => { 475 - if arr1.len() != arr2.len() { 476 - return false; 477 - } 478 - 479 - arr1.iter() 480 - .zip(arr2.iter()) 481 - .all(|(v1, v2)| has_only_ordering_diff(v1, v2) || v1 == v2) 482 - } 483 - _ => v1 == v2, 484 - } 485 - }

Configure Feed

Configure Feed