use mlf_lang::ast::*; use serde_json::Value as JsonValue; use std::fmt; use unicode_segmentation::UnicodeSegmentation; use regex::Regex; use url::Url; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; use langtag::LangTag; #[derive(Debug, Clone)] pub struct ValidationError { pub path: String, pub message: String, } impl fmt::Display for ValidationError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}: {}", self.path, self.message) } } impl std::error::Error for ValidationError {} pub struct RecordValidator<'a> { lexicon: &'a Lexicon, } impl<'a> RecordValidator<'a> { pub fn new(lexicon: &'a Lexicon) -> Self { Self { lexicon } } pub fn validate_record(&self, record: &JsonValue) -> Result<(), Vec> { let mut errors = Vec::new(); // Find the main record definition let main_item = self.find_main_item()?; match main_item { Item::Record(record_def) => { // Validate as object with the record's fields self.validate_object(record, &record_def.fields, "$", &mut errors); } Item::Query(_) | Item::Procedure(_) => { errors.push(ValidationError { path: "$".to_string(), message: "Cannot validate records against query/procedure definitions".to_string(), }); } _ => { errors.push(ValidationError { path: "$".to_string(), message: "No record definition found in lexicon".to_string(), }); } } if errors.is_empty() { Ok(()) } else { Err(errors) } } fn find_main_item(&self) -> Result<&Item, Vec> { // Look for a record, query, or procedure (main definitions) for item in &self.lexicon.items { match item { Item::Record(_) | Item::Query(_) | Item::Procedure(_) => { return Ok(item); } _ => continue, } } Err(vec![ValidationError { path: "$".to_string(), message: "No main definition found in lexicon".to_string(), }]) } fn validate_against_type( &self, value: &JsonValue, ty: &Type, path: &str, errors: &mut Vec, ) { match ty { Type::Primitive { kind, .. } => { self.validate_primitive(value, *kind, path, errors); } Type::Constrained { base, constraints, .. } => { self.validate_against_type(value, base, path, errors); self.validate_constraints(value, constraints, path, errors); } Type::Object { fields, .. } => { self.validate_object(value, fields, path, errors); } Type::Array { inner, .. } => { self.validate_array(value, inner, path, errors); } Type::Union { types, .. } => { self.validate_union(value, types, path, errors); } Type::Parenthesized { inner, .. } => { self.validate_against_type(value, inner, path, errors); } Type::Reference { path: ref_path, .. } => { // Try to resolve reference if let Some(resolved_type) = self.resolve_reference(ref_path) { self.validate_against_type(value, &resolved_type, path, errors); } else { // Can't resolve, skip validation } } Type::Unknown { .. } => { // Unknown type accepts anything } } } fn resolve_reference(&self, path: &Path) -> Option { // Simple resolution: look for inline/def types with matching name if path.segments.len() == 1 { let name = &path.segments[0].name; for item in &self.lexicon.items { match item { Item::InlineType(i) if i.name.name == *name => { return Some(i.ty.clone()); } Item::DefType(d) if d.name.name == *name => { return Some(d.ty.clone()); } _ => {} } } } None } fn validate_primitive( &self, value: &JsonValue, kind: PrimitiveType, path: &str, errors: &mut Vec, ) { match kind { PrimitiveType::Null => { if !value.is_null() { errors.push(ValidationError { path: path.to_string(), message: "Expected null".to_string(), }); } } PrimitiveType::Boolean => { if !value.is_boolean() { errors.push(ValidationError { path: path.to_string(), message: "Expected boolean".to_string(), }); } } PrimitiveType::Integer => { if let Some(n) = value.as_i64() { // Check JavaScript-safe integer range (-2^53 to 2^53) if n < -(1i64 << 53) || n > (1i64 << 53) { errors.push(ValidationError { path: path.to_string(), message: "Integer out of JavaScript-safe range".to_string(), }); } } else { errors.push(ValidationError { path: path.to_string(), message: "Expected integer".to_string(), }); } } PrimitiveType::String => { if !value.is_string() { errors.push(ValidationError { path: path.to_string(), message: "Expected string".to_string(), }); } } PrimitiveType::Bytes => { // Bytes should be encoded as {"$bytes": "base64-string"} if let Some(obj) = value.as_object() { if let Some(bytes_val) = obj.get("$bytes") { if !bytes_val.is_string() { errors.push(ValidationError { path: path.to_string(), message: "Expected $bytes to be a base64 string".to_string(), }); } } else { errors.push(ValidationError { path: path.to_string(), message: "Expected object with $bytes field".to_string(), }); } } else { errors.push(ValidationError { path: path.to_string(), message: "Expected bytes object with $bytes field".to_string(), }); } } PrimitiveType::Blob => { // Blob should have $type, ref, mimeType, size if let Some(obj) = value.as_object() { let required = ["$type", "ref", "mimeType", "size"]; for field in &required { if !obj.contains_key(*field) { errors.push(ValidationError { path: path.to_string(), message: format!("Blob missing required field: {}", field), }); } } } else { errors.push(ValidationError { path: path.to_string(), message: "Expected blob object".to_string(), }); } } } } fn validate_constraints( &self, value: &JsonValue, constraints: &[Constraint], path: &str, errors: &mut Vec, ) { for constraint in constraints { match constraint { Constraint::MinLength { value: min, .. } => { if let Some(s) = value.as_str() { if s.len() < *min { errors.push(ValidationError { path: path.to_string(), message: format!("String too short: {} bytes (min: {})", s.len(), min), }); } } else if let Some(arr) = value.as_array() { // MinLength can also apply to arrays (element count) if arr.len() < *min { errors.push(ValidationError { path: path.to_string(), message: format!("Array too short: {} elements (min: {})", arr.len(), min), }); } } } Constraint::MaxLength { value: max, .. } => { if let Some(s) = value.as_str() { if s.len() > *max { errors.push(ValidationError { path: path.to_string(), message: format!("String too long: {} bytes (max: {})", s.len(), max), }); } } else if let Some(arr) = value.as_array() { // MaxLength can also apply to arrays (element count) if arr.len() > *max { errors.push(ValidationError { path: path.to_string(), message: format!("Array too long: {} elements (max: {})", arr.len(), max), }); } } } Constraint::MinGraphemes { value: min, .. } => { if let Some(s) = value.as_str() { // Use proper Unicode grapheme cluster counting let count = s.graphemes(true).count(); if count < *min { errors.push(ValidationError { path: path.to_string(), message: format!("String has too few graphemes: {} (min: {})", count, min), }); } } } Constraint::MaxGraphemes { value: max, .. } => { if let Some(s) = value.as_str() { // Use proper Unicode grapheme cluster counting let count = s.graphemes(true).count(); if count > *max { errors.push(ValidationError { path: path.to_string(), message: format!("String has too many graphemes: {} (max: {})", count, max), }); } } } Constraint::Minimum { value: min, .. } => { if let Some(n) = value.as_i64() { if n < *min { errors.push(ValidationError { path: path.to_string(), message: format!("Value too small: {} (min: {})", n, min), }); } } } Constraint::Maximum { value: max, .. } => { if let Some(n) = value.as_i64() { if n > *max { errors.push(ValidationError { path: path.to_string(), message: format!("Value too large: {} (max: {})", n, max), }); } } } Constraint::Enum { values, .. } => { if let Some(s) = value.as_str() { let enum_strings: Vec = values.iter().map(|v| match v { mlf_lang::ast::ValueRef::Literal(lit) => lit.clone(), mlf_lang::ast::ValueRef::Reference(path) => path.to_string(), }).collect(); if !enum_strings.contains(&s.to_string()) { errors.push(ValidationError { path: path.to_string(), message: format!("Value '{}' not in enum: {:?}", s, enum_strings), }); } } } Constraint::Format { value: format, .. } => { if let Some(s) = value.as_str() { self.validate_format(s, format, path, errors); } } Constraint::Accept { mimes, .. } => { // Validate blob mimeType against accept list if let Some(obj) = value.as_object() { if let Some(mime) = obj.get("mimeType").and_then(|v| v.as_str()) { if !mimes.iter().any(|m| m == mime) { errors.push(ValidationError { path: path.to_string(), message: format!("MIME type '{}' not accepted (allowed: {:?})", mime, mimes), }); } } } } Constraint::MaxSize { value: max, .. } => { // Validate blob size if let Some(obj) = value.as_object() { if let Some(size) = obj.get("size").and_then(|v| v.as_u64()) { if size as usize > *max { errors.push(ValidationError { path: path.to_string(), message: format!("Blob size {} exceeds maximum: {}", size, max), }); } } } } Constraint::KnownValues { .. } => { // knownValues is a hint, not enforced } Constraint::Default { .. } => { // Default values are used when field is missing, not for validation } Constraint::Const { .. } => { // Const values are enforced at compile time, not runtime validation } } } } fn validate_format( &self, value: &str, format: &str, path: &str, errors: &mut Vec, ) { let is_valid = match format { "datetime" => validate_datetime(value), "uri" => validate_uri(value), "at-uri" => validate_at_uri(value), "did" => validate_did(value), "handle" => validate_handle(value), "nsid" => validate_nsid(value), "cid" => validate_cid(value), "at-identifier" => validate_at_identifier(value), "language" => validate_language(value), "tid" => validate_tid(value), "record-key" => validate_record_key(value), _ => true, // Unknown format, pass validation }; if !is_valid { errors.push(ValidationError { path: path.to_string(), message: format!("Invalid {} format: '{}'", format, value), }); } } fn validate_object( &self, value: &JsonValue, fields: &[Field], path: &str, errors: &mut Vec, ) { if let Some(obj) = value.as_object() { // Check required fields for field in fields { if !field.optional && !obj.contains_key(&field.name.name) { errors.push(ValidationError { path: if path == "$" { field.name.name.clone() } else { format!("{}.{}", path, field.name.name) }, message: "Required field missing".to_string(), }); } else if let Some(field_value) = obj.get(&field.name.name) { let field_path = if path == "$" { field.name.name.clone() } else { format!("{}.{}", path, field.name.name) }; self.validate_against_type(field_value, &field.ty, &field_path, errors); } } } else { errors.push(ValidationError { path: path.to_string(), message: format!("Expected object, got {}", value_type_name(value)), }); } } fn validate_array( &self, value: &JsonValue, inner: &Type, path: &str, errors: &mut Vec, ) { if let Some(arr) = value.as_array() { for (i, item) in arr.iter().enumerate() { let item_path = format!("{}[{}]", path, i); self.validate_against_type(item, inner, &item_path, errors); } } else { errors.push(ValidationError { path: path.to_string(), message: format!("Expected array, got {}", value_type_name(value)), }); } } fn validate_union( &self, value: &JsonValue, types: &[Type], path: &str, errors: &mut Vec, ) { // Try to validate against each type in the union let mut matched = false; for ty in types { let mut type_errors = Vec::new(); self.validate_against_type(value, ty, path, &mut type_errors); if type_errors.is_empty() { matched = true; break; } } if !matched { errors.push(ValidationError { path: path.to_string(), message: format!("Value does not match any type in union ({} variants tried)", types.len()), }); } } } fn value_type_name(value: &JsonValue) -> &'static str { match value { JsonValue::Null => "null", JsonValue::Bool(_) => "boolean", JsonValue::Number(_) => "number", JsonValue::String(_) => "string", JsonValue::Array(_) => "array", JsonValue::Object(_) => "object", } } // Format validators /// Validate datetime format (RFC 3339 / ISO 8601) fn validate_datetime(value: &str) -> bool { // Use time crate for proper RFC 3339 parsing OffsetDateTime::parse(value, &Rfc3339).is_ok() } /// Validate URI format (RFC 3986) fn validate_uri(value: &str) -> bool { // Use url crate for proper URI parsing Url::parse(value).is_ok() } /// Validate AT-URI format (at://did:plc:xyz/com.example.foo/record-key) fn validate_at_uri(value: &str) -> bool { // AT-URI format: at://authority/collection/rkey // authority is a DID or handle // collection is an NSID // rkey is optional if !value.starts_with("at://") { return false; } // Strip the scheme let without_scheme = &value[5..]; // Split by first slash to get authority and path let (authority, path) = match without_scheme.split_once('/') { Some((auth, p)) => (auth, Some(p)), None => (without_scheme, None), }; // Authority must be a DID or handle if !validate_did(authority) && !validate_handle(authority) { return false; } // Path validation (if present) if let Some(path_str) = path { if !path_str.is_empty() { // Path should be collection or collection/rkey let parts: Vec<&str> = path_str.split('/').filter(|s| !s.is_empty()).collect(); if parts.is_empty() || parts.len() > 2 { return false; } // Collection should be an NSID if !validate_nsid(parts[0]) { return false; } // Record key validation (if present) if parts.len() == 2 && !validate_record_key(parts[1]) { return false; } } } true } /// Validate DID format (did:method:identifier) fn validate_did(value: &str) -> bool { // DID format: did:method:method-specific-id // method: lowercase letters, numbers // method-specific-id: alphanumeric plus . - _ : let re = Regex::new( r"^did:[a-z0-9]+:[a-zA-Z0-9._:%-]*[a-zA-Z0-9._-]$" ).unwrap(); re.is_match(value) } /// Validate handle format (domain name) fn validate_handle(value: &str) -> bool { // Handle is a domain name: segment.segment.segment // Each segment: alphanumeric and hyphen, can't start or end with hyphen // Must have at least one dot if !value.contains('.') || value.starts_with('.') || value.ends_with('.') { return false; } // Check each segment for segment in value.split('.') { if segment.is_empty() || segment.starts_with('-') || segment.ends_with('-') || segment.len() > 63 { return false; } if !segment.chars().all(|c| c.is_ascii_alphanumeric() || c == '-') { return false; } } // Total length check value.len() <= 253 } /// Validate NSID format (namespaced identifier) fn validate_nsid(value: &str) -> bool { // NSID format: authority.name(.name)* // authority: domain name (reversed) // name: lowercase alphanumeric, max 63 chars per segment // Total: 3-317 chars if value.len() < 3 || value.len() > 317 { return false; } let parts: Vec<&str> = value.split('.').collect(); if parts.len() < 3 { return false; } // Check each segment for part in &parts { if part.is_empty() || part.len() > 63 { return false; } // NSID segments must be lowercase alphanumeric (and hyphen for domain parts) if !part.chars().all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-') { return false; } // Can't start with digit if part.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false) { return false; } } true } /// Validate CID format (Content Identifier) fn validate_cid(value: &str) -> bool { // CID format is complex (multibase encoded multihash) // Basic validation: non-empty, starts with base58btc or base32 prefix // Full validation would require parsing the multibase/multihash if value.is_empty() { return false; } // CIDv0: starts with 'Qm' (base58btc) // CIDv1: starts with 'b' (base32) or 'z' (base58btc) followed by version if value.starts_with("Qm") && value.len() == 46 { // CIDv0 - all base58btc chars return value.chars().all(|c| { c.is_ascii_alphanumeric() && c != '0' && c != 'O' && c != 'I' && c != 'l' }); } if (value.starts_with('b') || value.starts_with('z')) && value.len() > 10 { // CIDv1 - basic check for valid multibase chars return value.chars().all(|c| c.is_ascii_alphanumeric()); } false } /// Validate AT-identifier format (DID or handle) fn validate_at_identifier(value: &str) -> bool { validate_did(value) || validate_handle(value) } /// Validate language code format (BCP 47) fn validate_language(value: &str) -> bool { // Use langtag crate for proper BCP 47 / RFC 5646 validation LangTag::new(value).is_ok() } /// Validate TID format (Timestamp Identifier) fn validate_tid(value: &str) -> bool { // TID: 13 character base32-sortable timestamp // Uses a-z2-7 character set (no 0,1,8,9) if value.len() != 13 { return false; } value.chars().all(|c| { matches!(c, 'a'..='z' | '2'..='7') }) } /// Validate record-key format fn validate_record_key(value: &str) -> bool { // Record key: alphanumeric, dot, underscore, tilde, hyphen // 1-512 characters // Can be TID or custom key if value.is_empty() || value.len() > 512 { return false; } // If it looks like a TID, validate as TID if value.len() == 13 && value.chars().all(|c| matches!(c, 'a'..='z' | '2'..='7')) { return validate_tid(value); } // Otherwise, general record key validation value.chars().all(|c| { c.is_ascii_alphanumeric() || c == '.' || c == '_' || c == '~' || c == '-' }) } #[cfg(test)] mod tests { use super::*; #[test] fn test_validate_datetime() { // Valid datetimes assert!(validate_datetime("2024-01-15T10:30:00Z")); assert!(validate_datetime("2024-01-15T10:30:00.123Z")); assert!(validate_datetime("2024-01-15T10:30:00+05:30")); assert!(validate_datetime("2024-01-15T10:30:00-08:00")); // Invalid datetimes assert!(!validate_datetime("2024-01-15")); assert!(!validate_datetime("2024-01-15 10:30:00")); assert!(!validate_datetime("not-a-date")); } #[test] fn test_validate_uri() { // Valid URIs assert!(validate_uri("https://example.com")); assert!(validate_uri("http://example.com/path")); assert!(validate_uri("ftp://example.com")); assert!(validate_uri("custom-scheme://something")); // Invalid URIs assert!(!validate_uri("not a uri")); assert!(!validate_uri("://missing-scheme")); assert!(!validate_uri("")); } #[test] fn test_validate_at_uri() { // Valid AT-URIs assert!(validate_at_uri("at://did:plc:abc123")); assert!(validate_at_uri("at://did:plc:abc123/com.example.foo")); assert!(validate_at_uri("at://did:plc:abc123/com.example.foo/abc123")); assert!(validate_at_uri("at://alice.example.com/com.example.post/abc")); // Invalid AT-URIs assert!(!validate_at_uri("https://example.com")); assert!(!validate_at_uri("at://")); assert!(!validate_at_uri("not-at-uri")); } #[test] fn test_validate_did() { // Valid DIDs assert!(validate_did("did:plc:abc123xyz")); assert!(validate_did("did:web:example.com")); assert!(validate_did("did:key:abc123")); // Invalid DIDs assert!(!validate_did("not-a-did")); assert!(!validate_did("did:")); assert!(!validate_did("did:UPPERCASE:test")); // method must be lowercase } #[test] fn test_validate_handle() { // Valid handles assert!(validate_handle("example.com")); assert!(validate_handle("alice.example.com")); assert!(validate_handle("my-site.example.com")); // Invalid handles assert!(!validate_handle("nodomainext")); assert!(!validate_handle(".example.com")); assert!(!validate_handle("example.com.")); assert!(!validate_handle("-invalid.com")); assert!(!validate_handle("invalid-.com")); } #[test] fn test_validate_nsid() { // Valid NSIDs assert!(validate_nsid("com.example.foo")); assert!(validate_nsid("com.example.foo.bar")); assert!(validate_nsid("io.github.user.action")); // Invalid NSIDs assert!(!validate_nsid("com.example")); // need at least 3 segments assert!(!validate_nsid("COM.EXAMPLE.FOO")); // must be lowercase assert!(!validate_nsid("com.123invalid.foo")); // can't start with digit assert!(!validate_nsid("co")); // too short } #[test] fn test_validate_cid() { // Valid CIDs (examples) assert!(validate_cid("QmYwAPJzv5CZsnA625s3Xf2nemtYgPpHdWEz79ojWnPbdG")); // CIDv0 assert!(validate_cid("bafybeihdwdcefgh4dqkjv67uzcmw7ojee6xedzdetojuzjevtenxquvyku")); // CIDv1 // Invalid CIDs assert!(!validate_cid("")); assert!(!validate_cid("not-a-cid")); assert!(!validate_cid("Qm123")); // too short } #[test] fn test_validate_at_identifier() { // Valid (DIDs) assert!(validate_at_identifier("did:plc:abc123")); // Valid (handles) assert!(validate_at_identifier("example.com")); assert!(validate_at_identifier("alice.example.com")); // Invalid assert!(!validate_at_identifier("not-valid")); assert!(!validate_at_identifier("")); } #[test] fn test_validate_language() { // Valid language codes (BCP 47 / RFC 5646) assert!(validate_language("en")); assert!(validate_language("en-US")); assert!(validate_language("zh-Hans-CN")); assert!(validate_language("fr-CA")); assert!(validate_language("en-GB")); assert!(validate_language("de-DE")); // Invalid language codes assert!(!validate_language("e")); // too short assert!(!validate_language("en_US")); // wrong separator (underscore) assert!(!validate_language("")); // empty assert!(!validate_language("123")); // starts with digit assert!(!validate_language("en--US")); // double separator } #[test] fn test_validate_tid() { // Valid TIDs (13 chars, base32-sortable) assert!(validate_tid("3jui7kd54zh2y")); assert!(validate_tid("3k2a4dqudbbz2")); // Invalid TIDs assert!(!validate_tid("3jui7kd54zh2")); // too short assert!(!validate_tid("3jui7kd54zh2yy")); // too long assert!(!validate_tid("3jui7kd54zh2Y")); // uppercase not allowed assert!(!validate_tid("3jui0kd54zh2y")); // 0 not allowed } #[test] fn test_validate_record_key() { // Valid record keys assert!(validate_record_key("3jui7kd54zh2y")); // TID assert!(validate_record_key("my-record-key")); assert!(validate_record_key("key.with.dots")); assert!(validate_record_key("key_with_underscores")); assert!(validate_record_key("key~with~tildes")); // Invalid record keys assert!(!validate_record_key("")); // empty assert!(!validate_record_key(&"a".repeat(513))); // too long assert!(!validate_record_key("key with spaces")); // spaces not allowed } #[test] fn test_grapheme_counting() { // Unicode grapheme cluster counting test use unicode_segmentation::UnicodeSegmentation; let text = "👨‍👩‍👧‍👦"; // Family emoji (1 grapheme cluster) assert_eq!(text.graphemes(true).count(), 1); let text = "hello"; // 5 graphemes assert_eq!(text.graphemes(true).count(), 5); let text = "नमस्ते"; // Devanagari (3 grapheme clusters) assert_eq!(text.graphemes(true).count(), 3); } }