(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(* Main parser entry point - bytesrw-only API *)

open Bytesrw

module Dom = Dom
module Tokenizer = Tokenizer
module Encoding = Encoding

type parse_error = Parser_tree_builder.parse_error
type fragment_context = Parser_tree_builder.fragment_context

(* Result of a parse.
   - [root]: the node returned by [Parser_tree_builder.finish].
   - [errors]: tokenizer errors followed by tree-builder errors.
   - [encoding]: [None] when produced by [parse]; [Some enc] when produced
     by [parse_bytes], where [enc] is what [Encoding.decode] reported. *)
type t = {
  root : Dom.node;
  errors : parse_error list;
  encoding : Encoding.encoding option;
}

(* [is_html_namespace ns] is [true] when [ns] denotes the HTML namespace,
   which this file represents either as [None] or as [Some "html"]. *)
let is_html_namespace = function
  | None | Some "html" -> true
  | Some _ -> false

(* [tokenizer_state_for_element name] is the tokenizer state that the
   content of the HTML element [name] must be tokenized in, or [None] when
   no switch is needed. [name] is compared as-is, so callers holding a name
   of unknown case must lowercase it first.

   This is the single source of truth for both the token sink and the
   fragment-parsing setup, so the two can not drift apart. *)
let tokenizer_state_for_element = function
  | "textarea" | "title" -> Some Tokenizer_state.Rcdata
  | "style" | "xmp" | "iframe" | "noembed" | "noframes" ->
    Some Tokenizer_state.Rawtext
  | "script" -> Some Tokenizer_state.Script_data
  | "plaintext" -> Some Tokenizer_state.Plaintext
  | _ -> None

(* Token sink that feeds tokens to tree builder *)
module TreeBuilderSink = struct
  type t = Parser_tree_builder.t

  (* [process tb token ~line ~column] records the source position on [tb],
     hands [token] to the tree builder, and then tells the tokenizer whether
     to switch state: [`SwitchTo state] or [`Continue]. *)
  let process tb token ~line ~column =
    Parser_tree_builder.set_position tb ~line ~column;
    Parser_tree_builder.process_token tb token;
    (* Check if we need to switch tokenizer state based on current element *)
    (* Only switch for HTML namespace elements - SVG/MathML use different
       rules *)
    match Parser_tree_builder.current_node tb with
    | Some node when is_html_namespace node.Dom.namespace ->
      (match tokenizer_state_for_element node.Dom.name with
       | Some state -> `SwitchTo state
       | None -> `Continue)
    | Some _ | None -> `Continue

  (* Plain delegation to the tree builder. *)
  let adjusted_current_node_in_html_namespace tb =
    Parser_tree_builder.adjusted_current_node_in_html_namespace tb
end

(* Core parsing function that takes a Bytes.Reader.t

   [parse ?collect_errors ?fragment_context reader] tokenizes [reader] and
   builds a tree from it.
   - [collect_errors] (default [false]) is forwarded to both the tokenizer
     and the tree builder.
   - [fragment_context], when given, is forwarded to the tree builder and
     also selects the initial tokenizer state (see below).

   The returned record has [encoding = None]; its [errors] are the tokenizer
   errors followed by the tree-builder errors. *)
let parse ?(collect_errors=false) ?fragment_context (reader : Bytes.Reader.t) =
  let tb = Parser_tree_builder.create ~collect_errors ?fragment_context () in
  let tokenizer =
    Tokenizer.create (module TreeBuilderSink) tb ~collect_errors ()
  in
  (* Set tokenizer state for fragment parsing *)
  (* Note: We do NOT set last_start_tag because in fragment parsing, no start
     tag has been emitted. This means end tags won't match as
     "appropriate end tags" and will be treated as raw text in
     RCDATA/RAWTEXT/Script modes. *)
  (* Only change tokenizer state for HTML namespace contexts - foreign
     contexts use Data state *)
  (match fragment_context with
   | Some ctx when is_html_namespace ctx.namespace ->
     (* The context tag name is lowercased before lookup because the shared
        table is keyed on lowercase names. *)
     (match tokenizer_state_for_element (String.lowercase_ascii ctx.tag_name) with
      | Some state -> Tokenizer.set_state tokenizer state
      | None -> ())
   | Some _ | None -> ());
  Tokenizer.run tokenizer (module TreeBuilderSink) reader;
  let root = Parser_tree_builder.finish tb in
  let tokenizer_errors = Tokenizer.get_errors tokenizer in
  let tree_errors = Parser_tree_builder.get_errors tb in
  (* Re-express a tokenizer error in the tree-builder error record so both
     kinds can live in one list. *)
  let of_tokenizer_error e =
    { Parser_tree_builder.code = e.Tokenizer.Errors.code;
      line = e.Tokenizer.Errors.line;
      column = e.Tokenizer.Errors.column }
  in
  let all_errors = List.map of_tokenizer_error tokenizer_errors @ tree_errors in
  { root; errors = all_errors; encoding = None }

(* Parse raw bytes with automatic encoding detection

   [data] is decoded with [Encoding.decode] (honouring [transport_encoding]
   when supplied), the decoded text is parsed with [parse], and the encoding
   reported by the decoder is stored in the result. *)
let parse_bytes ?(collect_errors=false) ?transport_encoding ?fragment_context data =
  let (html, enc) = Encoding.decode data ?transport_encoding () in
  let reader = Bytes.Reader.of_string html in
  let result = parse ~collect_errors ?fragment_context reader in
  { result with encoding = Some enc }

(* [query t selector] runs [Selector.query] against the root of [t]. *)
let query t selector =
  Selector.query t.root selector

(* Serialize to a Bytes.Writer.t

   The whole document is first rendered to a string with [Dom.to_html] and
   then written in one [Bytes.Writer.write_string] call; nothing else is
   written to [writer] here. *)
let to_writer ?(pretty=true) ?(indent_size=2) t (writer : Bytes.Writer.t) =
  let html = Dom.to_html ~pretty ~indent_size t.root in
  Bytes.Writer.write_string writer html

(* Serialize to string (convenience for when result fits in memory) *)
let to_string ?(pretty=true) ?(indent_size=2) t =
  Dom.to_html ~pretty ~indent_size t.root

(* Extract text content

   Delegates to [Dom.to_text] on the root, with [separator] defaulting to a
   single space and [strip] defaulting to [true]. *)
let to_text ?(separator=" ") ?(strip=true) t =
  Dom.to_text ~separator ~strip t.root

(* For testing *)
let to_test_format t =
  Dom.to_test_format t.root