(** XHTML parser using xmlm for proper XML parsing. This module provides XML parsing for XHTML files, which the HTML5 parser cannot handle correctly (especially self-closing tags on non-void elements). *) (** Parse XHTML content using xmlm and return a DOM tree. *) let parse_xhtml content = let input = Xmlm.make_input (`String (0, content)) in (* Stack of nodes during parsing *) let stack = ref [] in let root = Html5rw.Dom.create_document () in stack := [root]; (* Helper to get namespace shorthand *) let ns_shorthand ns = if ns = "http://www.w3.org/2000/svg" then Some "svg" else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml" else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *) else if ns = "" then None (* No namespace = HTML *) else Some ns (* Keep other namespaces as-is *) in (* Process xmlm signals *) let rec process () = if Xmlm.eoi input then () else begin match Xmlm.input input with | `Dtd _ -> (* Skip DTD for now *) process () | `El_start ((ns, local), attrs) -> (* Create element node *) let namespace = ns_shorthand ns in let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in (* Append to current parent *) (match !stack with | parent :: _ -> Html5rw.Dom.append_child parent node | [] -> ()); (* Push onto stack *) stack := node :: !stack; process () | `El_end -> (* Pop from stack *) (match !stack with | _ :: rest -> stack := rest | [] -> ()); process () | `Data text -> (* Create text node and append to current parent *) let trimmed = String.trim text in if trimmed <> "" || String.length text > 0 then begin let text_node = Html5rw.Dom.create_text text in (match !stack with | parent :: _ -> Html5rw.Dom.append_child parent text_node | [] -> ()) end; process () end in try process (); Ok root with | Xmlm.Error ((line, col), err) -> Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err)) (** Check if a system_id indicates an XHTML file. *) let is_xhtml_file system_id = match system_id with | Some path -> String.length path > 6 && String.sub path (String.length path - 6) 6 = ".xhtml" | None -> false (** Wrap DOM in an Html5rw.t-compatible structure for the checker. *) type xhtml_doc = { root : Html5rw.Dom.node; errors : Html5rw.Error.t list; } let xhtml_root doc = doc.root let xhtml_errors _doc = [] (* XML parser handles errors differently *)