···11+open B0_kit.V000
22+33+(* OCaml library names *)
44+55+let xmlm = B0_ocaml.libname "xmlm"
66+77+(* Libraries *)
88+99+let xmlm_lib =
1010+ let srcs = Fpath.[`Dir (v "src")] in
1111+ let requires = [] in
1212+ B0_ocaml.lib xmlm ~doc:"The xmlm library" ~srcs ~requires
1313+1414+(* Tests *)
1515+1616+let test_exe src ~doc =
1717+ let src = Fpath.v src in
1818+ let srcs = Fpath.[`File src] in
1919+ let meta = B0_meta.(empty |> tag test) in
2020+ let requires = [ xmlm ] in
2121+ B0_ocaml.exe (Fpath.basename ~strip_ext:true src) ~srcs ~doc ~meta ~requires
2222+2323+let test = test_exe "test/test.ml" ~doc:"Test suite"
2424+let test_tree = test_exe "test/test_tree.ml" ~doc:"Test Xmlm.output_tree"
2525+let xhtml = test_exe "test/xhtml.ml" ~doc:"XHTML entities"
2626+2727+let xmltrip =
2828+ let doc = "Reads xml files and outputs them on stdout" in
2929+ let srcs = Fpath.[`File (v "test/xmltrip.ml");
3030+ `File (v "test/xhtml.ml") ]
3131+ in
3232+ let requires = [xmlm] in
3333+ B0_ocaml.exe "xmltrip" ~public:true ~doc ~srcs ~requires
3434+3535+(* Packs *)
3636+3737+let default =
3838+ let meta =
3939+ B0_meta.empty
4040+ |> B0_meta.(add authors) ["The xmlm programmers"]
4141+ |> B0_meta.(add maintainers)
4242+ ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"]
4343+ |> B0_meta.(add homepage) "https://erratique.ch/software/xmlm"
4444+ |> B0_meta.(add online_doc) "https://erratique.ch/software/xmlm/doc/"
4545+ |> B0_meta.(add licenses) ["ISC"]
4646+ |> B0_meta.(add repo) "git+https://erratique.ch/repos/xmlm.git"
4747+ |> B0_meta.(add issues) "https://github.com/dbuenzli/xmlm/issues"
4848+ |> B0_meta.(add description_tags)
4949+ ["xml"; "codec"; "org:erratique"]
5050+ |> B0_meta.tag B0_opam.tag
5151+ |> B0_meta.add B0_opam.depends
5252+ [ "ocaml", {|>= "4.05.0"|};
5353+ "ocamlfind", {|build|};
5454+ "ocamlbuild", {|build|};
5555+ "topkg", {|build & >= "1.0.3"|};
5656+ ]
5757+ |> B0_meta.add B0_opam.build
5858+ {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"]]|}
5959+ in
6060+ B0_pack.make "default" ~doc:"xmlm package" ~meta ~locked:true @@
6161+ B0_unit.list ()
vendor/opam/xmlm/BRZO
This is a binary file and will not be displayed.
+87
vendor/opam/xmlm/CHANGES.md
···11+22+33+- Add support for latin-9 (ISO-8859-15) encoded XML files.
44+ Thanks to Liu Yuxi for the patch.
55+66+77+v1.4.0 2022-02-08 La Forclaz (VS)
88+---------------------------------
99+1010+- OCaml 5.00 support. Thanks to Antonio Nuno Monteiro for the patch.
1111+1212+v1.3.0 2017-03-15 La Forclaz (VS)
1313+---------------------------------
1414+1515+- Add `Xmlm.pp_{dtd,name,attribute,tag,signal}`
1616+- Safe-string support.
1717+- Build depend on topkg.
1818+- Relicense from BSD3 to ISC.
1919+2020+v1.2.0 2013-09-06 Cambridge (UK)
2121+--------------------------------
2222+2323+- `Xmlm.output`, illegal XML Unicode characters in Data signals or
2424+ attribute values are output as U+FFFD (thanks to David Sheets for
2525+ insisting that something should be done about that).
2626+- Deprecate the ability to IO multiple documents from the same
2727+ IO abstraction.
2828+- Deprecate the functorial interface.
2929+- OPAM friendly workflow and drop OASIS support.
3030+3131+3232+v1.1.1 2012-08-05 Lausanne
3333+--------------------------
3434+3535+- OASIS 0.3.0 support.
3636+3737+3838+v1.1.0 2012-03-16 La Forclaz (VS)
3939+---------------------------------
4040+4141+- OASIS support.
4242+- Fixes a bug in the UTF-16 decoder.
4343+- Fixes a bug in `Xmlm.make_output` with a custom function. Thanks to
4444+ Konstantinas Myalo for the report and the patch.
4545+- New optional argument `decl` to `Xmlm.make_output` to control whether the
4646+ XML declaration should be output.
4747+- New function `Xmlm.output_depth`, returns the current element nesting level.
4848+4949+5050+v1.0.2 2009-11-11 大足县
5151+-----------------------
5252+5353+- Replaced a (non tail-recursive) use of `List.map`.
5454+5555+5656+v1.0.1 2008-08-01 Lausanne
5757+----------------------------
5858+5959+- POSIX compliant build shell script (thanks to Michael D Ekstrand).
6060+- Support for Debian packaging.
6161+6262+6363+v1.0.0 2008-03-17 Lausanne
6464+----------------------------
6565+6666+## New features:
6767+- Streaming IO api with support to IO arborescent data structures.
6868+- Proper XML namespace support, all names are expanded names.
6969+- Whitespace stripping respects the xml:space attributes.
7070+- Xmlm.Make functor to use other types for strings and internal buffers.
7171+- UTF-8 encoded documents can start with an UTF-8 encoded BOM.
7272+7373+## Incompatible changes:
7474+- `Xmlm.encoding` becomes a polymorphic variant.
7575+- `Xmlm.error` becomes a polymorphic variant and the "E_" prefix is dropped.
7676+- Removed the callback api.
7777+- Removed the tree and cursor api.
7878+7979+## Other:
8080+- `test/xhtml.ml` has a mapping from XHTML entities to their UTF-8 sequence.
8181+- Build system switched from make to ocamlbuild
8282+8383+8484+v0.9.0 2007-02-26 Lausanne
8585+--------------------------
8686+8787+- First release.
+13
vendor/opam/xmlm/LICENSE.md
···11+Copyright (c) 2007 The xmlm programmers
22+33+Permission to use, copy, modify, and/or distribute this software for any
44+purpose with or without fee is hereby granted, provided that the above
55+copyright notice and this permission notice appear in all copies.
66+77+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
88+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
99+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1010+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1111+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1212+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1313+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+50
vendor/opam/xmlm/README.md
···11+Xmlm — Streaming XML codec for OCaml
22+-------------------------------------------------------------------------------
33+%%VERSION%%
44+55+Xmlm is a streaming codec to decode and encode the XML data format. It
66+can process XML documents without a complete in-memory representation of the
77+data.
88+99+Xmlm is made of a single independent module and distributed
1010+under the ISC license.
1111+1212+Home page: http://erratique.ch/software/xmlm
1313+1414+## Installation
1515+1616+Xmlm can be installed with `opam`:
1717+1818+ opam install xmlm
1919+2020+If you don't use `opam` consult the [`opam`](opam) file for build
2121+instructions.
2222+2323+## Documentation
2424+2525+The documentation and API reference is automatically generated
2626+from the source interfaces. It can be consulted [online][doc]
2727+or via `odig doc xmlm`.
2828+2929+[doc]: http://erratique.ch/software/xmlm/doc/Xmlm
3030+3131+## Sample programs
3232+3333+If you installed xmlm with `opam` sample programs are located in
3434+the directory `opam config var xmlm:doc`.
3535+3636+3737+In the distribution sample programs and tests are located in the
3838+[`test`](test) directory of the distribution. They can be built and run
3939+with:
4040+4141+ topkg build --tests true && topkg test
4242+4343+The `xmltrip` tool reads XML files with Xmlm and outputs them back in
4444+various ways. It is useful to understand how Xmlm handles
4545+documents. `xmltrip -help` has more information.
4646+4747+If you need to parse XHTML, the file [`xhtml.ml`](test/xhtml.ml) in
4848+the `test` directory has an OCaml list coupling each XHTML character
4949+entity with its corresponding UTF-8 encoded character string. You can
5050+use it to program a suitable entity callback.
+4
vendor/opam/xmlm/_tags
···11+true : bin_annot, safe_string
22+<_b0> : -traverse
33+<src> : include
44+<test> : include
+7
vendor/opam/xmlm/doc/index.mld
···11+{0 Xmlm {%html: <span class="version">%%VERSION%%</span>%}}
22+33+Xmlm is a streaming codec to decode and encode the XML data format. It
44+can process XML documents without a complete in-memory representation of the
55+data.
66+77+{!modules: Xmlm}
···11+opam-version: "2.0"
22+name: "xmlm"
33+synopsis: "Streaming XML codec for OCaml"
44+description: """\
55+Xmlm is a streaming codec to decode and encode the XML data format. It
66+can process XML documents without a complete in-memory representation of the
77+data.
88+99+Xmlm is made of a single independent module and distributed
1010+under the ISC license.
1111+1212+Home page: http://erratique.ch/software/xmlm"""
1313+maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>"
1414+authors: "The xmlm programmers"
1515+license: "ISC"
1616+tags: ["xml" "codec" "org:erratique"]
1717+homepage: "https://erratique.ch/software/xmlm"
1818+doc: "https://erratique.ch/software/xmlm/doc/"
1919+bug-reports: "https://github.com/dbuenzli/xmlm/issues"
2020+depends: [
2121+ "ocaml" {>= "4.05.0"}
2222+ "ocamlfind" {build}
2323+ "ocamlbuild" {build}
2424+ "topkg" {build & >= "1.0.3"}
2525+]
2626+build: ["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"]
2727+dev-repo: "git+https://erratique.ch/repos/xmlm.git"
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007 The xmlm programmers. All rights reserved.
33+ Distributed under the ISC license, see terms at the end of the file.
44+ ---------------------------------------------------------------------------*)
55+66+module Std_string = String
77+module Std_buffer = Buffer
88+99+type std_string = string
1010+type std_buffer = Buffer.t
1111+1212+module type String = sig
1313+ type t
1414+ val empty : t
1515+ val length : t -> int
1616+ val append : t -> t -> t
1717+ val lowercase : t -> t
1818+ val iter : (int -> unit) -> t -> unit
1919+ val of_string : std_string -> t
2020+ val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a
2121+ val compare : t -> t -> int
2222+end
2323+2424+module type Buffer = sig
2525+ type string
2626+ type t
2727+ exception Full
2828+ val create : int -> t
2929+ val add_uchar : t -> int -> unit
3030+ val clear : t -> unit
3131+ val contents : t -> string
3232+ val length : t -> int
3333+end
3434+3535+module type S = sig
3636+ type string
3737+ type encoding = [
3838+ | `UTF_8
3939+ | `UTF_16
4040+ | `UTF_16BE
4141+ | `UTF_16LE
4242+ | `ISO_8859_1
4343+ | `ISO_8859_15
4444+ | `US_ASCII ]
4545+4646+ type dtd = string option
4747+ type name = string * string
4848+ type attribute = name * string
4949+ type tag = name * attribute list
5050+ type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
5151+5252+ val ns_xml : string
5353+ val ns_xmlns : string
5454+5555+ type pos = int * int
5656+ type error = [
5757+ | `Max_buffer_size
5858+ | `Unexpected_eoi
5959+ | `Malformed_char_stream
6060+ | `Unknown_encoding of string
6161+ | `Unknown_entity_ref of string
6262+ | `Unknown_ns_prefix of string
6363+ | `Illegal_char_ref of string
6464+ | `Illegal_char_seq of string
6565+ | `Expected_char_seqs of string list * string
6666+ | `Expected_root_element ]
6767+6868+ exception Error of pos * error
6969+ val error_message : error -> string
7070+7171+ type source = [
7272+ | `Channel of in_channel
7373+ | `String of int * std_string
7474+ | `Fun of (unit -> int) ]
7575+7676+ type input
7777+7878+ val make_input : ?enc:encoding option -> ?strip:bool ->
7979+ ?ns:(string -> string option) ->
8080+ ?entity: (string -> string option) -> source -> input
8181+8282+ val input : input -> signal
8383+8484+ val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
8585+ input -> 'a
8686+8787+ val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
8888+ input -> (dtd * 'a)
8989+9090+ val peek : input -> signal
9191+ val eoi : input -> bool
9292+ val pos : input -> pos
9393+9494+ type 'a frag = [ `El of tag * 'a list | `Data of string ]
9595+ type dest = [
9696+ | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
9797+9898+ type output
9999+ val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
100100+ ?ns_prefix:(string -> string option) -> dest -> output
101101+102102+ val output_depth : output -> int
103103+ val output : output -> signal -> unit
104104+ val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
105105+ val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
106106+end
107107+108108+109109+(* Unicode character lexers *)
110110+111111+exception Malformed (* for character stream, internal only. *)
112112+113113+let utf8_len = [| (* Char byte length according to first UTF-8 byte. *)
114114+ 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
115115+ 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
116116+ 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
117117+ 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
118118+ 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1;
119119+ 1; 1; 1; 1; 1; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
120120+ 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
121121+ 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0;
122122+ 0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2;
123123+ 2; 2; 2; 2; 2; 2; 2; 2; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3;
124124+ 4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |]
125125+126126+let uchar_utf8 i =
127127+ let b0 = i () in
128128+ begin match utf8_len.(b0) with
129129+ | 0 -> raise Malformed
130130+ | 1 -> b0
131131+ | 2 ->
132132+ let b1 = i () in
133133+ if b1 lsr 6 != 0b10 then raise Malformed else
134134+ ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)
135135+ | 3 ->
136136+ let b1 = i () in
137137+ let b2 = i () in
138138+ if b2 lsr 6 != 0b10 then raise Malformed else
139139+ begin match b0 with
140140+ | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then raise Malformed else ()
141141+ | 0xED -> if b1 < 0x80 || 0x9F < b1 then raise Malformed else ()
142142+ | _ -> if b1 lsr 6 != 0b10 then raise Malformed else ()
143143+ end;
144144+ ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
145145+ | 4 ->
146146+ let b1 = i () in
147147+ let b2 = i () in
148148+ let b3 = i () in
149149+ if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then raise Malformed else
150150+ begin match b0 with
151151+ | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then raise Malformed else ()
152152+ | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then raise Malformed else ()
153153+ | _ -> if b1 lsr 6 != 0b10 then raise Malformed else ()
154154+ end;
155155+ ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
156156+ ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F)
157157+ | _ -> assert false
158158+ end
159159+160160+let int16_be i =
161161+ let b0 = i () in
162162+ let b1 = i () in
163163+ (b0 lsl 8) lor b1
164164+165165+let int16_le i =
166166+ let b0 = i () in
167167+ let b1 = i () in
168168+ (b1 lsl 8) lor b0
169169+170170+let uchar_utf16 int16 i =
171171+ let c0 = int16 i in
172172+ if c0 < 0xD800 || c0 > 0xDFFF then c0 else
173173+ if c0 > 0xDBFF then raise Malformed else
174174+ let c1 = int16 i in
175175+ (((c0 land 0x3FF) lsl 10) lor (c1 land 0x3FF)) + 0x10000
176176+177177+let uchar_utf16be = uchar_utf16 int16_be
178178+let uchar_utf16le = uchar_utf16 int16_le
179179+let uchar_byte i = i ()
180180+let uchar_iso_8859_1 i = i ()
181181+182182+let uchar_iso_8859_15 i =
183183+ (* https://www.iana.org/assignments/charset-reg/ISO-8859-15 *)
184184+ match i () with
185185+ | 0x00A4 -> 0x20AC (* € *)
186186+ | 0x00A6 -> 0x0160 (* Š *)
187187+ | 0x00A8 -> 0x0161 (* š *)
188188+ | 0x00B4 -> 0x017D (* Ž *)
189189+ | 0x00B8 -> 0x017E (* ž *)
190190+ | 0x00BC -> 0x0152 (* Œ *)
191191+ | 0x00BD -> 0x0153 (* œ *)
192192+ | 0x00BE -> 0x0178 (* Ÿ *)
193193+ | c -> c
194194+195195+let uchar_ascii i = let b = i () in if b > 127 then raise Malformed else b
196196+197197+(* Functorized streaming XML IO *)
198198+199199+module Make (String : String) (Buffer : Buffer with type string = String.t) =
200200+struct
201201+ type string = String.t
202202+203203+ let str = String.of_string
204204+ let str_eq s s' = (compare s s') = 0
205205+ let str_empty s = (compare s String.empty) = 0
206206+ let cat = String.append
207207+ let str_of_char u =
208208+ let b = Buffer.create 4 in
209209+ Buffer.add_uchar b u;
210210+ Buffer.contents b
211211+212212+ module Ht = Hashtbl.Make (struct type t = string
213213+ let equal = str_eq
214214+ let hash = Hashtbl.hash end)
215215+216216+ let u_nl = 0x000A (* newline *)
217217+ let u_cr = 0x000D (* carriage return *)
218218+ let u_space = 0x0020 (* space *)
219219+ let u_quot = 0x0022 (* quote *)
220220+ let u_sharp = 0x0023 (* # *)
221221+ let u_amp = 0x0026 (* & *)
222222+ let u_apos = 0x0027 (* ' *)
223223+ let u_minus = 0x002D (* - *)
224224+ let u_slash = 0x002F (* / *)
225225+ let u_colon = 0x003A (* : *)
226226+ let u_scolon = 0x003B (* ; *)
227227+ let u_lt = 0x003C (* < *)
228228+ let u_eq = 0x003D (* = *)
229229+ let u_gt = 0x003E (* > *)
230230+ let u_qmark = 0x003F (* ? *)
231231+ let u_emark = 0x0021 (* ! *)
232232+ let u_lbrack = 0x005B (* [ *)
233233+ let u_rbrack = 0x005D (* ] *)
234234+ let u_x = 0x0078 (* x *)
235235+ let u_bom = 0xFEFF (* BOM *)
236236+ let u_9 = 0x0039 (* 9 *)
237237+ let u_F = 0x0046 (* F *)
238238+ let u_D = 0X0044 (* D *)
239239+240240+ let s_cdata = str "CDATA["
241241+ let ns_xml = str "http://www.w3.org/XML/1998/namespace"
242242+ let ns_xmlns = str "http://www.w3.org/2000/xmlns/"
243243+ let n_xml = str "xml"
244244+ let n_xmlns = str "xmlns"
245245+ let n_space = str "space"
246246+ let n_version = str "version"
247247+ let n_encoding = str "encoding"
248248+ let n_standalone = str "standalone"
249249+ let v_yes = str "yes"
250250+ let v_no = str "no"
251251+ let v_preserve = str "preserve"
252252+ let v_default = str "default"
253253+ let v_version_1_0 = str "1.0"
254254+ let v_version_1_1 = str "1.1"
255255+ let v_utf_8 = str "utf-8"
256256+ let v_utf_16 = str "utf-16"
257257+ let v_utf_16be = str "utf-16be"
258258+ let v_utf_16le = str "utf-16le"
259259+ let v_iso_8859_1 = str "iso-8859-1"
260260+ let v_iso_8859_15 = str "iso-8859-15"
261261+ let v_us_ascii = str "us-ascii"
262262+ let v_ascii = str "ascii"
263263+264264+ let name_str (p,l) = if str_empty p then l else cat p (cat (str ":") l)
265265+266266+ (* Basic types and values *)
267267+268268+ type encoding = [
269269+ | `UTF_8
270270+ | `UTF_16
271271+ | `UTF_16BE
272272+ | `UTF_16LE
273273+ | `ISO_8859_1
274274+ | `ISO_8859_15
275275+ | `US_ASCII ]
276276+277277+ type dtd = string option
278278+ type name = string * string
279279+ type attribute = name * string
280280+ type tag = name * attribute list
281281+ type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
282282+283283+ (* Input *)
284284+285285+ type pos = int * int
286286+ type error = [
287287+ | `Max_buffer_size
288288+ | `Unexpected_eoi
289289+ | `Malformed_char_stream
290290+ | `Unknown_encoding of string
291291+ | `Unknown_entity_ref of string
292292+ | `Unknown_ns_prefix of string
293293+ | `Illegal_char_ref of string
294294+ | `Illegal_char_seq of string
295295+ | `Expected_char_seqs of string list * string
296296+ | `Expected_root_element ]
297297+298298+ exception Error of pos * error
299299+300300+ let error_message e =
301301+ let bracket l v r = cat (str l) (cat v (str r)) in
302302+ match e with
303303+ | `Expected_root_element -> str "expected root element"
304304+ | `Max_buffer_size -> str "maximal buffer size exceeded"
305305+ | `Unexpected_eoi -> str "unexpected end of input"
306306+ | `Malformed_char_stream -> str "malformed character stream"
307307+ | `Unknown_encoding e -> bracket "unknown encoding (" e ")"
308308+ | `Unknown_entity_ref e -> bracket "unknown entity reference (" e ")"
309309+ | `Unknown_ns_prefix e -> bracket "unknown namespace prefix (" e ")"
310310+ | `Illegal_char_ref s -> bracket "illegal character reference (#" s ")"
311311+ | `Illegal_char_seq s ->
312312+ bracket "character sequence illegal here (\"" s "\")"
313313+ | `Expected_char_seqs (exps, fnd) ->
314314+ let exps =
315315+ let exp acc v = cat acc (bracket "\"" v "\", ") in
316316+ List.fold_left exp String.empty exps
317317+ in
318318+ cat (str "expected one of these character sequence: ")
319319+ (cat exps (bracket "found \"" fnd "\""))
320320+321321+ type limit = (* XML is odd to parse. *)
322322+ | Stag of name (* '<' qname *)
323323+ | Etag of name (* '</' qname whitespace* *)
324324+ | Pi of name (* '<?' qname *)
325325+ | Comment (* '<!--' *)
326326+ | Cdata (* '<![CDATA[' *)
327327+ | Dtd (* '<!' *)
328328+ | Text (* other character *)
329329+ | Eoi (* End of input *)
330330+331331+ type source = [
332332+ | `Channel of in_channel
333333+ | `String of int * std_string
334334+ | `Fun of (unit -> int) ]
335335+336336+ type input =
337337+ { enc : encoding option; (* Expected encoding. *)
338338+ strip : bool; (* Whitespace stripping default behaviour. *)
339339+ fun_ns : string -> string option; (* Namespace callback. *)
340340+ fun_entity : string -> string option; (* Entity reference callback. *)
341341+ i : unit -> int; (* Byte level input. *)
342342+ mutable uchar : (unit -> int) -> int; (* Unicode character lexer. *)
343343+ mutable c : int; (* Character lookahead. *)
344344+ mutable cr : bool; (* True if last u was '\r'. *)
345345+ mutable line : int; (* Current line number. *)
346346+ mutable col : int; (* Current column number. *)
347347+ mutable limit : limit; (* Last parsed limit. *)
348348+ mutable peek : signal; (* Signal lookahead. *)
349349+ mutable stripping : bool; (* True if stripping whitespace. *)
350350+ mutable last_white : bool; (* True if last char was white. *)
351351+ mutable scopes : (name * string list * bool) list;
352352+ (* Stack of qualified el. name, bound prefixes and strip behaviour. *)
353353+ ns : string Ht.t; (* prefix -> uri bindings. *)
354354+ ident : Buffer.t; (* Buffer for names and entity refs. *)
355355+ data : Buffer.t; } (* Buffer for character and attribute data. *)
356356+357357+ let err_input_tree = "input signal not `El_start or `Data"
358358+ let err_input_doc_tree = "input signal not `Dtd"
359359+ let err i e = raise (Error ((i.line, i.col), e))
360360+ let err_illegal_char i u = err i (`Illegal_char_seq (str_of_char u))
361361+ let err_expected_seqs i exps s = err i (`Expected_char_seqs (exps, s))
362362+ let err_expected_chars i exps =
363363+ err i (`Expected_char_seqs (List.map str_of_char exps, str_of_char i.c))
364364+365365+ let u_eoi = max_int
366366+ let u_start_doc = u_eoi - 1
367367+ let u_end_doc = u_start_doc - 1
368368+ let signal_start_stream = `Data String.empty
369369+370370+ let make_input ?(enc = None) ?(strip = false) ?(ns = fun _ -> None)
371371+ ?(entity = fun _ -> None) src =
372372+ let i = match src with
373373+ | `Fun f -> f
374374+ | `Channel ic -> fun () -> input_byte ic
375375+ | `String (pos, s) ->
376376+ let len = Std_string.length s in
377377+ let pos = ref (pos - 1) in
378378+ fun () ->
379379+ incr pos;
380380+ if !pos = len then raise End_of_file else
381381+ Char.code (Std_string.get s !pos)
382382+ in
383383+ let bindings =
384384+ let h = Ht.create 15 in
385385+ Ht.add h String.empty String.empty;
386386+ Ht.add h n_xml ns_xml;
387387+ Ht.add h n_xmlns ns_xmlns;
388388+ h
389389+ in
390390+ { enc = enc; strip = strip; fun_ns = ns; fun_entity = entity;
391391+ i = i; uchar = uchar_byte; c = u_start_doc; cr = false;
392392+ line = 1; col = 0; limit = Text; peek = signal_start_stream;
393393+ stripping = strip; last_white = true; scopes = []; ns = bindings;
394394+ ident = Buffer.create 64; data = Buffer.create 1024; }
395395+396396+ (* Bracketed non-terminals in comments refer to XML 1.0 non terminals *)
397397+398398+ let r : int -> int -> int -> bool = fun u a b -> a <= u && u <= b
399399+ let is_white = function 0x0020 | 0x0009 | 0x000D | 0x000A -> true | _ -> false
400400+401401+ let is_char = function (* {Char} *)
402402+ | u when r u 0x0020 0xD7FF -> true
403403+ | 0x0009 | 0x000A | 0x000D -> true
404404+ | u when r u 0xE000 0xFFFD || r u 0x10000 0x10FFFF -> true
405405+ | _ -> false
406406+407407+ let is_digit u = r u 0x0030 0x0039
408408+ let is_hex_digit u =
409409+ r u 0x0030 0x0039 || r u 0x0041 0x0046 || r u 0x0061 0x0066
410410+411411+ let comm_range u = (* common to functions below *)
412412+ r u 0x00C0 0x00D6 || r u 0x00D8 0x00F6 || r u 0x00F8 0x02FF ||
413413+ r u 0x0370 0x037D || r u 0x037F 0x1FFF || r u 0x200C 0x200D ||
414414+ r u 0x2070 0x218F || r u 0x2C00 0x2FEF || r u 0x3001 0xD7FF ||
415415+ r u 0xF900 0xFDCF || r u 0xFDF0 0xFFFD || r u 0x10000 0xEFFFF
416416+417417+ let is_name_start_char = function (* {NameStartChar} - ':' (XML 1.1) *)
418418+ | u when r u 0x0061 0x007A || r u 0x0041 0x005A -> true (* [a-z] | [A-Z] *)
419419+ | u when is_white u -> false
420420+ | 0x005F -> true (* '_' *)
421421+ | u when comm_range u -> true
422422+ | _ -> false
423423+424424+ let is_name_char = function (* {NameChar} - ':' (XML 1.1) *)
425425+ | u when r u 0x0061 0x007A || r u 0x0041 0x005A -> true (* [a-z] | [A-Z] *)
426426+ | u when is_white u -> false
427427+ | u when r u 0x0030 0x0039 -> true (* [0-9] *)
428428+ | 0x005F | 0x002D | 0x002E | 0x00B7 -> true (* '_' '-' '.' *)
429429+ | u when comm_range u || r u 0x0300 0x036F || r u 0x203F 0x2040 -> true
430430+ | _ -> false
431431+432432+ let rec nextc i =
433433+ if i.c = u_eoi then err i `Unexpected_eoi;
434434+ if i.c = u_nl then (i.line <- i.line + 1; i.col <- 1)
435435+ else i.col <- i.col + 1;
436436+ i.c <- i.uchar i.i;
437437+ if not (is_char i.c) then raise Malformed;
438438+ if i.cr && i.c = u_nl then i.c <- i.uchar i.i; (* cr nl business *)
439439+ if i.c = u_cr then (i.cr <- true; i.c <- u_nl) else i.cr <- false
440440+441441+ let nextc_eof i = try nextc i with End_of_file -> i.c <- u_eoi
442442+ let skip_white i = while (is_white i.c) do nextc i done
443443+ let skip_white_eof i = while (is_white i.c) do nextc_eof i done
444444+ let accept i c = if i.c = c then nextc i else err_expected_chars i [ c ]
445445+446446+ let clear_ident i = Buffer.clear i.ident
447447+ let clear_data i = Buffer.clear i.data
448448+ let addc_ident i c = Buffer.add_uchar i.ident c
449449+ let addc_data i c = Buffer.add_uchar i.data c
450450+451451+ let addc_data_strip i c =
452452+ if is_white c then i.last_white <- true else
453453+ begin
454454+ if i.last_white && Buffer.length i.data <> 0 then addc_data i u_space;
455455+ i.last_white <- false;
456456+ addc_data i c
457457+ end
458458+459459+ let expand_name i (prefix, local) =
460460+ let external_ prefix = match i.fun_ns prefix with
461461+ | None -> err i (`Unknown_ns_prefix prefix)
462462+ | Some uri -> uri
463463+ in
464464+ try
465465+ let uri = Ht.find i.ns prefix in
466466+ if not (str_empty uri) then (uri, local) else
467467+ if str_empty prefix then String.empty, local else
468468+ (external_ prefix), local (* unbound with xmlns:prefix="" *)
469469+ with Not_found -> external_ prefix, local
470470+471471+ let find_encoding i = (* Encoding mess. *)
472472+ let reset uchar i = i.uchar <- uchar; i.col <- 0; nextc i in
473473+ match i.enc with
474474+ | None -> (* User doesn't know encoding. *)
475475+ begin match nextc i; i.c with
476476+ | 0xFE -> (* UTF-16BE BOM. *)
477477+ nextc i; if i.c <> 0xFF then err i `Malformed_char_stream;
478478+ reset uchar_utf16be i;
479479+ true
480480+ | 0xFF -> (* UTF-16LE BOM. *)
481481+ nextc i; if i.c <> 0xFE then err i `Malformed_char_stream;
482482+ reset uchar_utf16le i;
483483+ true
484484+ | 0xEF -> (* UTF-8 BOM. *)
485485+ nextc i; if i.c <> 0xBB then err i `Malformed_char_stream;
486486+ nextc i; if i.c <> 0xBF then err i `Malformed_char_stream;
487487+ reset uchar_utf8 i;
488488+ true
489489+ | 0x3C | _ -> (* UTF-8 or other, try declaration. *)
490490+ i.uchar <- uchar_utf8;
491491+ false
492492+ end
493493+ | Some e -> (* User knows encoding. *)
494494+ begin match e with
495495+ | `US_ASCII -> reset uchar_ascii i
496496+ | `ISO_8859_1 -> reset uchar_iso_8859_1 i
497497+ | `ISO_8859_15 -> reset uchar_iso_8859_15 i
498498+ | `UTF_8 -> (* Skip BOM if present. *)
499499+ reset uchar_utf8 i; if i.c = u_bom then (i.col <- 0; nextc i)
500500+ | `UTF_16 -> (* Which UTF-16 ? look BOM. *)
501501+ let b0 = nextc i; i.c in
502502+ let b1 = nextc i; i.c in
503503+ begin match b0, b1 with
504504+ | 0xFE, 0xFF -> reset uchar_utf16be i
505505+ | 0xFF, 0xFE -> reset uchar_utf16le i
506506+ | _ -> err i `Malformed_char_stream;
507507+ end
508508+ | `UTF_16BE -> (* Skip BOM if present. *)
509509+ reset uchar_utf16be i; if i.c = u_bom then (i.col <- 0; nextc i)
510510+ | `UTF_16LE ->
511511+ reset uchar_utf16le i; if i.c = u_bom then (i.col <- 0; nextc i)
512512+ end;
513513+ true (* Ignore xml declaration. *)
514514+515515+516516+ let p_ncname i = (* {NCName} (Namespace 1.1) *)
517517+ clear_ident i;
518518+ if not (is_name_start_char i.c) then err_illegal_char i i.c else
519519+ begin
520520+ addc_ident i i.c; nextc i;
521521+ while is_name_char i.c do addc_ident i i.c; nextc i done;
522522+ Buffer.contents i.ident
523523+ end
524524+525525+ let p_qname i = (* {QName} (Namespace 1.1) *)
526526+ let n = p_ncname i in
527527+ if i.c <> u_colon then (String.empty, n) else (nextc i; (n, p_ncname i))
528528+529529+ let p_charref i = (* {CharRef}, '&' was eaten. *)
530530+ let c = ref 0 in
531531+ clear_ident i;
532532+ nextc i;
533533+ if i.c = u_scolon then err i (`Illegal_char_ref String.empty) else
534534+ begin
535535+ try
536536+ if i.c = u_x then
537537+ begin
538538+ addc_ident i i.c;
539539+ nextc i;
540540+ while (i.c <> u_scolon) do
541541+ addc_ident i i.c;
542542+ if not (is_hex_digit i.c) then raise Exit else
543543+ c := !c * 16 + (if i.c <= u_9 then i.c - 48 else
544544+ if i.c <= u_F then i.c - 55 else
545545+ i.c - 87);
546546+ nextc i;
547547+ done
548548+ end
549549+ else
550550+ while (i.c <> u_scolon) do
551551+ addc_ident i i.c;
552552+ if not (is_digit i.c) then raise Exit else
553553+ c := !c * 10 + (i.c - 48);
554554+ nextc i
555555+ done
556556+ with Exit ->
557557+ c := -1; while i.c <> u_scolon do addc_ident i i.c; nextc i done
558558+ end;
559559+ nextc i;
560560+ if is_char !c then (clear_ident i; addc_ident i !c; Buffer.contents i.ident)
561561+ else err i (`Illegal_char_ref (Buffer.contents i.ident))
562562+563563+ let predefined_entities =
564564+ let h = Ht.create 5 in
565565+ let e k v = Ht.add h (str k) (str v) in
566566+ e "lt" "<"; e "gt" ">"; e "amp" "&"; e "apos" "'"; e "quot" "\"";
567567+ h
568568+569569+ let p_entity_ref i = (* {EntityRef}, '&' was eaten. *)
570570+ let ent = p_ncname i in
571571+ accept i u_scolon;
572572+ try Ht.find predefined_entities ent with Not_found ->
573573+ match i.fun_entity ent with
574574+ | Some s -> s
575575+ | None -> err i (`Unknown_entity_ref ent)
576576+577577+ let p_reference i = (* {Reference} *)
578578+ nextc i; if i.c = u_sharp then p_charref i else p_entity_ref i
579579+580580+ let p_attr_value i = (* {S}? {AttValue} *)
581581+ skip_white i;
582582+ let delim =
583583+ if i.c = u_quot || i.c = u_apos then i.c else
584584+ err_expected_chars i [ u_quot; u_apos]
585585+ in
586586+ nextc i;
587587+ skip_white i;
588588+ clear_data i;
589589+ i.last_white <- true;
590590+ while (i.c <> delim) do
591591+ if i.c = u_lt then err_illegal_char i u_lt else
592592+ if i.c = u_amp then String.iter (addc_data_strip i) (p_reference i)
593593+ else (addc_data_strip i i.c; nextc i)
594594+ done;
595595+ nextc i;
596596+ Buffer.contents i.data
597597+598598+ let p_attributes i = (* ({S} {Attribute})* {S}? *)
599599+ let rec aux i pre_acc acc =
600600+ if not (is_white i.c) then pre_acc, acc else
601601+ begin
602602+ skip_white i;
603603+ if i.c = u_slash || i.c = u_gt then pre_acc, acc else
604604+ begin
605605+ let (prefix, local) as n = p_qname i in
606606+ let v = skip_white i; accept i u_eq; p_attr_value i in
607607+ let att = n, v in
608608+ if str_empty prefix && str_eq local n_xmlns then
609609+ begin (* xmlns *)
610610+ Ht.add i.ns String.empty v;
611611+ aux i (String.empty :: pre_acc) (att :: acc)
612612+ end
613613+ else if str_eq prefix n_xmlns then
614614+ begin (* xmlns:local *)
615615+ Ht.add i.ns local v;
616616+ aux i (local :: pre_acc) (att :: acc)
617617+ end
618618+ else if str_eq prefix n_xml && str_eq local n_space then
619619+ begin (* xml:space *)
620620+ if str_eq v v_preserve then i.stripping <- false else
621621+ if str_eq v v_default then i.stripping <- i.strip else ();
622622+ aux i pre_acc (att :: acc)
623623+ end
624624+ else
625625+ aux i pre_acc (att :: acc)
626626+ end
627627+ end
628628+ in
629629+ aux i [] [] (* Returns a list of bound prefixes and attributes *)
630630+631631+ let p_limit i = (* Parses a markup limit *)
632632+ i.limit <-
633633+ if i.c = u_eoi then Eoi else
634634+ if i.c <> u_lt then Text else
635635+ begin
636636+ nextc i;
637637+ if i.c = u_qmark then (nextc i; Pi (p_qname i)) else
638638+ if i.c = u_slash then
639639+ begin
640640+ nextc i;
641641+ let n = p_qname i in
642642+ skip_white i;
643643+ Etag n
644644+ end
645645+ else if i.c = u_emark then
646646+ begin
647647+ nextc i;
648648+ if i.c = u_minus then (nextc i; accept i u_minus; Comment) else
649649+ if i.c = u_D then Dtd else
650650+ if i.c = u_lbrack then
651651+ begin
652652+ nextc i;
653653+ clear_ident i;
654654+ for k = 1 to 6 do (addc_ident i i.c; nextc i) done;
655655+ let cdata = Buffer.contents i.ident in
656656+ if str_eq cdata s_cdata then Cdata else
657657+ err_expected_seqs i [ s_cdata ] cdata
658658+ end
659659+ else
660660+ err i (`Illegal_char_seq (cat (str "<!") (str_of_char i.c)))
661661+ end
662662+ else
663663+ Stag (p_qname i)
664664+ end
665665+666666+ let rec skip_comment i = (* {Comment}, '<!--' was eaten *)
667667+ while (i.c <> u_minus) do nextc i done;
668668+ nextc i;
669669+ if i.c <> u_minus then skip_comment i else
670670+ begin
671671+ nextc i;
672672+ if i.c <> u_gt then err_expected_chars i [ u_gt ];
673673+ nextc_eof i
674674+ end
675675+676676+ let rec skip_pi i = (* {PI}, '<?' qname was eaten *)
677677+ while (i.c <> u_qmark) do nextc i done;
678678+ nextc i;
679679+ if i.c <> u_gt then skip_pi i else nextc_eof i
680680+681681+ let rec skip_misc i ~allow_xmlpi = match i.limit with (* {Misc}* *)
682682+ | Pi (p,l) when (str_empty p && str_eq n_xml (String.lowercase l)) ->
683683+ if allow_xmlpi then () else err i (`Illegal_char_seq l)
684684+ | Pi _ -> skip_pi i; p_limit i; skip_misc i ~allow_xmlpi
685685+ | Comment -> skip_comment i; p_limit i; skip_misc i ~allow_xmlpi
686686+ | Text when is_white i.c ->
687687+ skip_white_eof i; p_limit i; skip_misc i ~allow_xmlpi
688688+ | _ -> ()
689689+690690+ let p_chardata addc i = (* {CharData}* ({Reference}{Chardata})* *)
691691+ while (i.c <> u_lt) do
692692+ if i.c = u_amp then String.iter (addc i) (p_reference i)
693693+ else if i.c = u_rbrack then
694694+ begin
695695+ addc i i.c;
696696+ nextc i;
697697+ if i.c = u_rbrack then begin
698698+ addc i i.c;
699699+ nextc i; (* detects ']'*']]>' *)
700700+ while (i.c = u_rbrack) do addc i i.c; nextc i done;
701701+ if i.c = u_gt then err i (`Illegal_char_seq (str "]]>"));
702702+ end
703703+ end
704704+ else
705705+ (addc i i.c; nextc i)
706706+ done
707707+708708+ let rec p_cdata addc i = (* {CData} {CDEnd} *)
709709+ try while (true) do
710710+ if i.c = u_rbrack then begin
711711+ nextc i;
712712+ while i.c = u_rbrack do
713713+ nextc i;
714714+ if i.c = u_gt then (nextc i; raise Exit);
715715+ addc i u_rbrack
716716+ done;
717717+ addc i u_rbrack;
718718+ end;
719719+ addc i i.c;
720720+ nextc i;
721721+ done with Exit -> ()
722722+723723+ let p_xml_decl i ~ignore_enc ~ignore_utf16 = (* {XMLDecl}? *)
724724+ let yes_no = [v_yes; v_no] in
725725+ let p_val i = skip_white i; accept i u_eq; skip_white i; p_attr_value i in
726726+ let p_val_exp i exp =
727727+ let v = p_val i in
728728+ if not (List.exists (str_eq v) exp) then err_expected_seqs i exp v
729729+ in
730730+ match i.limit with
731731+ | Pi (p, l) when (str_empty p && str_eq l n_xml) ->
732732+ let v = skip_white i; p_ncname i in
733733+ if not (str_eq v n_version) then err_expected_seqs i [ n_version ] v;
734734+ p_val_exp i [v_version_1_0; v_version_1_1];
735735+ skip_white i;
736736+ if i.c <> u_qmark then begin
737737+ let n = p_ncname i in
738738+ if str_eq n n_encoding then begin
739739+ let enc = String.lowercase (p_val i) in
740740+ if not ignore_enc then begin
741741+ if str_eq enc v_utf_8 then i.uchar <- uchar_utf8 else
742742+ if str_eq enc v_utf_16be then i.uchar <- uchar_utf16be else
743743+ if str_eq enc v_utf_16le then i.uchar <- uchar_utf16le else
744744+ if str_eq enc v_iso_8859_1 then i.uchar <- uchar_iso_8859_1 else
745745+ if str_eq enc v_iso_8859_15 then i.uchar <- uchar_iso_8859_15 else
746746+ if str_eq enc v_us_ascii then i.uchar <- uchar_ascii else
747747+ if str_eq enc v_ascii then i.uchar <- uchar_ascii else
748748+ if str_eq enc v_utf_16 then
749749+ if ignore_utf16 then () else (err i `Malformed_char_stream)
750750+ (* A BOM should have been found. *)
751751+ else
752752+ err i (`Unknown_encoding enc)
753753+ end;
754754+ skip_white i;
755755+ if i.c <> u_qmark then begin
756756+ let n = p_ncname i in
757757+ if str_eq n n_standalone then p_val_exp i yes_no else
758758+ err_expected_seqs i [ n_standalone; str "?>" ] n
759759+ end
760760+ end
761761+ else if str_eq n n_standalone then
762762+ p_val_exp i yes_no
763763+ else
764764+ err_expected_seqs i [ n_encoding; n_standalone; str "?>" ] n
765765+ end;
766766+ skip_white i;
767767+ accept i u_qmark;
768768+ accept i u_gt;
769769+ p_limit i
770770+ | _ -> ()
771771+772772+ let p_dtd_signal i =(* {Misc}* {doctypedecl} {Misc}* *)
773773+ skip_misc i ~allow_xmlpi:false;
774774+ if i.limit <> Dtd then `Dtd None else
775775+ begin
776776+ let buf = addc_data i in
777777+ let nest = ref 1 in
778778+ clear_data i;
779779+ buf u_lt; buf u_emark; (* add eaten "<!" *)
780780+ while (!nest > 0) do
781781+ if i.c = u_lt then
782782+ begin
783783+ nextc i;
784784+ if i.c <> u_emark then
785785+ (buf u_lt; incr nest)
786786+ else
787787+ begin
788788+ nextc i;
789789+ if i.c <> u_minus then (* Carefull with comments ! *)
790790+ (buf u_lt; buf u_emark; incr nest)
791791+ else
792792+ begin
793793+ nextc i;
794794+ if i.c <> u_minus then
795795+ (buf u_lt; buf u_emark; buf u_minus; incr nest)
796796+ else
797797+ (nextc i; skip_comment i)
798798+ end
799799+ end
800800+ end
801801+ else if i.c = u_quot || i.c = u_apos then
802802+ begin
803803+ let c = i.c in
804804+ buf c; nextc i;
805805+ while (i.c <> c) do (buf i.c; nextc i) done;
806806+ buf c; nextc i
807807+ end
808808+ else if i.c = u_gt then (buf u_gt; nextc i; decr nest)
809809+ else (buf i.c; nextc i)
810810+ done;
811811+ let dtd = Buffer.contents i.data in
812812+ p_limit i;
813813+ skip_misc i ~allow_xmlpi:false;
814814+ `Dtd (Some dtd);
815815+ end
816816+817817+ let p_data i =
818818+ let rec bufferize addc i = match i.limit with
819819+ | Text -> p_chardata addc i; p_limit i; bufferize addc i
820820+ | Cdata -> p_cdata addc i; p_limit i; bufferize addc i
821821+ | (Stag _ | Etag _) -> ()
822822+ | Pi _ -> skip_pi i; p_limit i; bufferize addc i
823823+ | Comment -> skip_comment i; p_limit i; bufferize addc i
824824+ | Dtd -> err i (`Illegal_char_seq (str "<!D"))
825825+ | Eoi -> err i `Unexpected_eoi
826826+ in
827827+ clear_data i;
828828+ i.last_white <- true;
829829+ bufferize (if i.stripping then addc_data_strip else addc_data) i;
830830+ let d = Buffer.contents i.data in
831831+ d
832832+833833+ let p_el_start_signal i n =
834834+ let expand_att (((prefix, local) as n, v) as att) =
835835+ if not (str_eq prefix String.empty) then expand_name i n, v else
836836+ if str_eq local n_xmlns then (ns_xmlns, n_xmlns), v else
837837+ att (* default namespaces do not influence attributes. *)
838838+ in
839839+ let strip = i.stripping in (* save it here, p_attributes may change it. *)
840840+ let prefixes, atts = p_attributes i in
841841+ i.scopes <- (n, prefixes, strip) :: i.scopes;
842842+ `El_start ((expand_name i n), List.rev_map expand_att atts)
843843+844844+ let p_el_end_signal i n = match i.scopes with
845845+ | (n', prefixes, strip) :: scopes ->
846846+ if i.c <> u_gt then err_expected_chars i [ u_gt ];
847847+ if not (str_eq n n') then err_expected_seqs i [name_str n'] (name_str n);
848848+ i.scopes <- scopes;
849849+ i.stripping <- strip;
850850+ List.iter (Ht.remove i.ns) prefixes;
851851+ if scopes = [] then i.c <- u_end_doc else (nextc i; p_limit i);
852852+ `El_end
853853+ | _ -> assert false
854854+855855+ let p_signal i =
856856+ if i.scopes = [] then
857857+ match i.limit with
858858+ | Stag n -> p_el_start_signal i n
859859+ | _ -> err i `Expected_root_element
860860+ else
861861+ let rec find i = match i.limit with
862862+ | Stag n -> p_el_start_signal i n
863863+ | Etag n -> p_el_end_signal i n
864864+ | Text | Cdata ->
865865+ let d = p_data i in
866866+ if str_empty d then find i else `Data d
867867+ | Pi _ -> skip_pi i; p_limit i; find i
868868+ | Comment -> skip_comment i; p_limit i; find i
869869+ | Dtd -> err i (`Illegal_char_seq (str "<!D"))
870870+ | Eoi -> err i `Unexpected_eoi
871871+ in
872872+ begin match i.peek with
873873+ | `El_start (n, _) -> (* finish to input start el. *)
874874+ skip_white i;
875875+ if i.c = u_gt then (accept i u_gt; p_limit i) else
876876+ if i.c = u_slash then
877877+ begin
878878+ let tag = match i.scopes with
879879+ | (tag, _, _) :: _ -> tag | _ -> assert false
880880+ in
881881+ (nextc i; i.limit <- Etag tag)
882882+ end
883883+ else
884884+ err_expected_chars i [ u_slash; u_gt ]
885885+ | _ -> ()
886886+ end;
887887+ find i
888888+889889+ let eoi i =
890890+ try
891891+ if i.c = u_eoi then true else
892892+ if i.c <> u_start_doc then false else (* In a document. *)
893893+ if i.peek <> `El_end then (* Start of document sequence. *)
894894+ begin
895895+ let ignore_enc = find_encoding i in
896896+ p_limit i;
897897+ p_xml_decl i ~ignore_enc ~ignore_utf16:false;
898898+ i.peek <- p_dtd_signal i;
899899+ false
900900+ end
901901+ else (* Subsequent documents. *)
902902+ begin
903903+ nextc_eof i;
904904+ p_limit i;
905905+ if i.c = u_eoi then true else
906906+ begin
907907+ skip_misc i ~allow_xmlpi:true;
908908+ if i.c = u_eoi then true else
909909+ begin
910910+ p_xml_decl i ~ignore_enc:false ~ignore_utf16:true;
911911+ i.peek <- p_dtd_signal i;
912912+ false
913913+ end
914914+ end
915915+ end
916916+ with
917917+ | Buffer.Full -> err i `Max_buffer_size
918918+ | Malformed -> err i `Malformed_char_stream
919919+ | End_of_file -> err i `Unexpected_eoi
920920+921921+ let peek i = if eoi i then err i `Unexpected_eoi else i.peek
922922+923923+ let input i =
924924+ try
925925+ if i.c = u_end_doc then (i.c <- u_start_doc; i.peek) else
926926+ let s = peek i in
927927+ i.peek <- p_signal i;
928928+ s
929929+ with
930930+ | Buffer.Full -> err i `Max_buffer_size
931931+ | Malformed -> err i `Malformed_char_stream
932932+ | End_of_file -> err i `Unexpected_eoi
933933+934934+ let input_tree ~el ~data i = match input i with
935935+ | `Data d -> data d
936936+ | `El_start tag ->
937937+ let rec aux i tags context = match input i with
938938+ | `El_start tag -> aux i (tag :: tags) ([] :: context)
939939+ | `El_end ->
940940+ begin match tags, context with
941941+ | tag :: tags', childs :: context' ->
942942+ let el = el tag (List.rev childs) in
943943+ begin match context' with
944944+ | parent :: context'' -> aux i tags' ((el :: parent) :: context'')
945945+ | [] -> el
946946+ end
947947+ | _ -> assert false
948948+ end
949949+ | `Data d ->
950950+ begin match context with
951951+ | childs :: context' -> aux i tags (((data d) :: childs) :: context')
952952+ | [] -> assert false
953953+ end
954954+ | `Dtd _ -> assert false
955955+ in
956956+ aux i (tag :: []) ([] :: [])
957957+ | _ -> invalid_arg err_input_tree
958958+959959+960960+ let input_doc_tree ~el ~data i = match input i with
961961+ | `Dtd d -> d, input_tree ~el ~data i
962962+ | _ -> invalid_arg err_input_doc_tree
963963+964964+ let pos i = i.line, i.col
965965+966966+ (* Output *)
967967+968968+ type 'a frag = [ `El of tag * 'a list | `Data of string ]
969969+ type dest = [
970970+ | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
971971+972972+ type output =
973973+ { decl : bool; (* True if the XML declaration should be output. *)
974974+ nl : bool; (* True if a newline is output at the end. *)
975975+ indent : int option; (* Optional indentation. *)
976976+ fun_prefix : string -> string option; (* Prefix callback. *)
977977+ prefixes : string Ht.t; (* uri -> prefix bindings. *)
978978+ outs : std_string -> int -> int -> unit; (* String output. *)
979979+ outc : char -> unit; (* character output. *)
980980+ mutable last_el_start : bool; (* True if last signal was `El_start *)
981981+ mutable scopes : (name * (string list)) list;
982982+ (* Qualified el. name and bound uris. *)
983983+ mutable depth : int; } (* Scope depth. *)
984984+985985+ let err_prefix uri = "unbound namespace (" ^ uri ^ ")"
986986+ let err_dtd = "dtd signal not allowed here"
987987+ let err_el_start = "start signal not allowed here"
988988+ let err_el_end = "end signal without matching start signal"
989989+ let err_data = "data signal not allowed here"
990990+991991+ let make_output ?(decl = true) ?(nl = false) ?(indent = None)
992992+ ?(ns_prefix = fun _ ->None) d =
993993+ let outs, outc = match d with
994994+ | `Channel c -> (output_substring c), (output_char c)
995995+ | `Buffer b -> (Std_buffer.add_substring b), (Std_buffer.add_char b)
996996+ | `Fun f ->
997997+ let os s p l =
998998+ for i = p to p + l - 1 do f (Char.code (Std_string.get s i)) done
999999+ in
10001000+ let oc c = f (Char.code c) in
10011001+ os, oc
10021002+ in
10031003+ let prefixes =
10041004+ let h = Ht.create 10 in
10051005+ Ht.add h String.empty String.empty;
10061006+ Ht.add h ns_xml n_xml;
10071007+ Ht.add h ns_xmlns n_xmlns;
10081008+ h
10091009+ in
10101010+ { decl = decl; outs = outs; outc = outc; nl = nl; indent = indent;
10111011+ last_el_start = false; prefixes = prefixes; scopes = []; depth = -1;
10121012+ fun_prefix = ns_prefix; }
10131013+10141014+ let output_depth o = o.depth
10151015+ let outs o s = o.outs s 0 (Std_string.length s)
10161016+ let str_utf_8 s = String.to_utf_8 (fun _ s -> s) "" s
10171017+ let out_utf_8 o s = ignore (String.to_utf_8 (fun o s -> outs o s; o) o s)
10181018+10191019+ let prefix_name o (ns, local) =
10201020+ try
10211021+ if str_eq ns ns_xmlns && str_eq local n_xmlns then (String.empty, n_xmlns)
10221022+ else (Ht.find o.prefixes ns, local)
10231023+ with Not_found ->
10241024+ match o.fun_prefix ns with
10251025+ | None -> invalid_arg (err_prefix (str_utf_8 ns))
10261026+ | Some prefix -> prefix, local
10271027+10281028+ let bind_prefixes o atts =
10291029+ let add acc ((ns, local), uri) =
10301030+ if not (str_eq ns ns_xmlns) then acc else
10311031+ begin
10321032+ let prefix = if str_eq local n_xmlns then String.empty else local in
10331033+ Ht.add o.prefixes uri prefix;
10341034+ uri :: acc
10351035+ end
10361036+ in
10371037+ List.fold_left add [] atts
10381038+10391039+ let out_data o s =
10401040+ let out () s =
10411041+ let len = Std_string.length s in
10421042+ let start = ref 0 in
10431043+ let last = ref 0 in
10441044+ let escape e =
10451045+ o.outs s !start (!last - !start);
10461046+ outs o e;
10471047+ incr last;
10481048+ start := !last
10491049+ in
10501050+ while (!last < len) do match Std_string.get s !last with
10511051+ | '<' -> escape "<" (* Escape markup delimiters. *)
10521052+ | '>' -> escape ">"
10531053+ | '&' -> escape "&"
10541054+ (* | '\'' -> escape "'" *) (* Not needed we use \x22 for attributes. *)
10551055+ | '\x22' -> escape """
10561056+ | '\n' | '\t' | '\r' -> incr last
10571057+ | c when c < ' ' -> escape "\xEF\xBF\xBD" (* illegal, subst. by U+FFFD *)
10581058+ | _ -> incr last
10591059+ done;
10601060+ o.outs s !start (!last - !start)
10611061+ in
10621062+ String.to_utf_8 out () s
10631063+10641064+ let out_qname o (p, l) =
10651065+ if not (str_empty p) then (out_utf_8 o p; o.outc ':');
10661066+ out_utf_8 o l
10671067+10681068+ let out_attribute o (n, v) =
10691069+ o.outc ' '; out_qname o (prefix_name o n); outs o "=\x22";
10701070+ out_data o v;
10711071+ o.outc '\x22'
10721072+10731073+ let output o s =
10741074+ let indent o = match o.indent with
10751075+ | None -> ()
10761076+ | Some c -> for i = 1 to (o.depth * c) do o.outc ' ' done
10771077+ in
10781078+ let unindent o = match o.indent with None -> () | Some _ -> o.outc '\n' in
10791079+ if o.depth = -1 then
10801080+ begin match s with
10811081+ | `Dtd d ->
10821082+ if o.decl then outs o "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
10831083+ begin match d with
10841084+ | Some dtd -> out_utf_8 o dtd; o.outc '\n'
10851085+ | None -> ()
10861086+ end;
10871087+ o.depth <- 0
10881088+ | `Data _ -> invalid_arg err_data
10891089+ | `El_start _ -> invalid_arg err_el_start
10901090+ | `El_end -> invalid_arg err_el_end
10911091+ end
10921092+ else
10931093+ begin match s with
10941094+ | `El_start (n, atts) ->
10951095+ if o.last_el_start then (outs o ">"; unindent o);
10961096+ indent o;
10971097+ let uris = bind_prefixes o atts in
10981098+ let qn = prefix_name o n in
10991099+ o.outc '<'; out_qname o qn; List.iter (out_attribute o) atts;
11001100+ o.scopes <- (qn, uris) :: o.scopes;
11011101+ o.depth <- o.depth + 1;
11021102+ o.last_el_start <- true
11031103+ | `El_end ->
11041104+ begin match o.scopes with
11051105+ | (n, uris) :: scopes' ->
11061106+ o.depth <- o.depth - 1;
11071107+ if o.last_el_start then outs o "/>" else
11081108+ begin
11091109+ indent o;
11101110+ outs o "</"; out_qname o n; o.outc '>';
11111111+ end;
11121112+ o.scopes <- scopes';
11131113+ List.iter (Ht.remove o.prefixes) uris;
11141114+ o.last_el_start <- false;
11151115+ if o.depth = 0 then (if o.nl then o.outc '\n'; o.depth <- -1;)
11161116+ else unindent o
11171117+ | [] -> invalid_arg err_el_end
11181118+ end
11191119+ | `Data d ->
11201120+ if o.last_el_start then (outs o ">"; unindent o);
11211121+ indent o;
11221122+ out_data o d;
11231123+ unindent o;
11241124+ o.last_el_start <- false
11251125+ | `Dtd _ -> failwith err_dtd
11261126+ end
11271127+11281128+ let output_tree frag o v =
11291129+ let rec aux o = function
11301130+ | (v :: rest) :: context ->
11311131+ begin match frag v with
11321132+ | `El (tag, childs) ->
11331133+ output o (`El_start tag);
11341134+ aux o (childs :: rest :: context)
11351135+ | (`Data d) as signal ->
11361136+ output o signal;
11371137+ aux o (rest :: context)
11381138+ end
11391139+ | [] :: [] -> ()
11401140+ | [] :: context -> output o `El_end; aux o context
11411141+ | [] -> assert false
11421142+ in
11431143+ aux o ([v] :: [])
11441144+11451145+ let output_doc_tree frag o (dtd, v) =
11461146+ output o (`Dtd dtd);
11471147+ output_tree frag o v
11481148+11491149+end
11501150+11511151+(* Default streaming XML IO *)
11521152+11531153+module String = struct
11541154+ type t = string
11551155+ let empty = ""
11561156+ let length = String.length
11571157+ let append = ( ^ )
11581158+ let lowercase = String.lowercase_ascii
11591159+ let iter f s =
11601160+ let len = Std_string.length s in
11611161+ let pos = ref ~-1 in
11621162+ let i () =
11631163+ incr pos;
11641164+ if !pos = len then raise Exit else
11651165+ Char.code (Std_string.get s !pos)
11661166+ in
11671167+ try while true do f (uchar_utf8 i) done with Exit -> ()
11681168+11691169+ let of_string s = s
11701170+ let to_utf_8 f v x = f v x
11711171+ let compare = String.compare
11721172+end
11731173+11741174+module Buffer = struct
11751175+ type string = String.t
11761176+ type t = Buffer.t
11771177+ exception Full
11781178+ let create = Buffer.create
11791179+ let add_uchar b u =
11801180+ try
11811181+ (* UTF-8 encodes an uchar in the buffer, assumes u is valid code point. *)
11821182+ let buf c = Buffer.add_char b (Char.chr c) in
11831183+ if u <= 0x007F then
11841184+ (buf u)
11851185+ else if u <= 0x07FF then
11861186+ (buf (0xC0 lor (u lsr 6));
11871187+ buf (0x80 lor (u land 0x3F)))
11881188+ else if u <= 0xFFFF then
11891189+ (buf (0xE0 lor (u lsr 12));
11901190+ buf (0x80 lor ((u lsr 6) land 0x3F));
11911191+ buf (0x80 lor (u land 0x3F)))
11921192+ else
11931193+ (buf (0xF0 lor (u lsr 18));
11941194+ buf (0x80 lor ((u lsr 12) land 0x3F));
11951195+ buf (0x80 lor ((u lsr 6) land 0x3F));
11961196+ buf (0x80 lor (u land 0x3F)))
11971197+ with Failure _ -> raise Full
11981198+11991199+ let clear b = Buffer.clear b
12001200+ let contents = Buffer.contents
12011201+ let length = Buffer.length
12021202+end
12031203+12041204+include Make(String) (Buffer)
12051205+12061206+(* Pretty printers *)
12071207+12081208+let pp = Format.fprintf
12091209+let rec pp_list ?(pp_sep = Format.pp_print_cut) pp_v ppf = function
12101210+| [] -> ()
12111211+| v :: vs ->
12121212+ pp_v ppf v; if vs <> [] then (pp_sep ppf (); pp_list ~pp_sep pp_v ppf vs)
12131213+12141214+let pp_name ppf (p, l) = if p <> "" then pp ppf "%s:%s" p l else pp ppf "%s" l
12151215+let pp_attribute ppf (n, v) = pp ppf "@[<1>(%a,@,%S)@]" pp_name n v
12161216+let pp_tag ppf (name, atts) =
12171217+ let pp_sep ppf () = pp ppf ";@ " in
12181218+ pp ppf "@[<1>(%a,@,@[<1>[%a]@])@]"
12191219+ pp_name name (pp_list ~pp_sep pp_attribute) atts
12201220+12211221+let pp_dtd ppf = function
12221222+| None -> pp ppf "None"
12231223+| Some dtd -> pp ppf "@[<1>(Some@ %S)@]" dtd
12241224+12251225+let pp_signal ppf = function
12261226+| `Data s -> pp ppf "@[`Data %S@]" s
12271227+| `El_end -> pp ppf "`El_end"
12281228+| `El_start tag -> pp ppf "@[`El_start %a@]" pp_tag tag
12291229+| `Dtd dtd -> pp ppf "@[`Dtd %a@]" pp_dtd dtd
12301230+12311231+(*----------------------------------------------------------------------------
12321232+ Copyright (c) 2007 The xmlm programmers
12331233+12341234+ Permission to use, copy, modify, and/or distribute this software for any
12351235+ purpose with or without fee is hereby granted, provided that the above
12361236+ copyright notice and this permission notice appear in all copies.
12371237+12381238+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12391239+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12401240+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12411241+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12421242+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12431243+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
12441244+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
12451245+ ---------------------------------------------------------------------------*)
+827
vendor/opam/xmlm/src/xmlm.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007 The xmlm programmers. All rights reserved.
33+ Distributed under the ISC license, see terms at the end of the file.
44+ ---------------------------------------------------------------------------*)
55+66+(** Streaming XML codec.
77+88+ A well-formed sequence of {{!signal}signals} represents an
99+ {{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in
1010+ depth first order (this has nothing to do with XML
1111+ well-formedness). Input pulls a well-formed sequence of signals
1212+ from a data source and output pushes a well-formed sequence of
1313+ signals to a data destination. Functions are provided to easily
1414+ transform sequences of signals to/from arborescent data structures.
1515+1616+ Consult the {{!io}features and limitations} and {{!ex}examples}
1717+ of use.
1818+1919+ {b References}
2020+ {ul
2121+ {- Tim Bray.
2222+ {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}},
2323+ 1998.}
2424+ {- Tim Bray et al.
2525+ {e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}},
2626+ 2006.}} *)
2727+2828+(** {1:types Basic types and values} *)
2929+3030+(** The type for character encodings. For [`UTF_16], endianness is
3131+ determined from the
3232+ {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
3333+type encoding = [
3434+ | `UTF_8
3535+ | `UTF_16
3636+ (** Endianness determined from the
3737+ {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
3838+ | `UTF_16BE
3939+ | `UTF_16LE
4040+ | `ISO_8859_1
4141+ | `ISO_8859_15
4242+ | `US_ASCII ]
4343+4444+type dtd = string option
4545+(** The type for the optional
4646+ {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *)
4747+4848+type name = string * string
4949+(** The type for attribute and element's
5050+ {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}
5151+ [(uri,local)]. An empty [uri] represents a name without a
5252+ namespace name, i.e. an unprefixed name
5353+ that is not under the scope of a default namespace. *)
5454+5555+type attribute = name * string
5656+(** The type for attributes. Name and attribute data. *)
5757+5858+type tag = name * attribute list
5959+(** The type for an element tag. Tag name and attribute list. *)
6060+6161+type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
6262+(** The type for signals. A {e well-formed} sequence of signals belongs
6363+ to the language of the [doc] grammar :
6464+ {[doc ::= `Dtd tree
6565+tree ::= `El_start child `El_end
6666+child ::= `Data trees | trees
6767+trees ::= tree child | epsilon]}
6868+ The [trees] production is used to expresses the fact that there will
6969+ never be two consecutive `Data signals in the children of an element.
7070+7171+ Input and output deal only with well-formed sequences or
7272+ exceptions are raised. However on output consecutive [`Data]
7373+ signals are allowed. *)
7474+7575+val ns_xml : string
7676+(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the
7777+ reserved ["xml"] prefix. *)
7878+7979+val ns_xmlns : string
8080+(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the
8181+ reserved ["xmlns"] prefix. *)
8282+8383+val pp_dtd : Format.formatter -> dtd -> unit
8484+(** [pp_dtd ppf dtd] prints an unspecified representation of [dtd] on [ppf]. *)
8585+8686+val pp_name : Format.formatter -> name -> unit
8787+(** [pp_name ppf name] prints an unspecified representation of [name] on
8888+ [ppf]. *)
8989+9090+val pp_attribute : Format.formatter -> attribute -> unit
9191+(** [pp_attribute ppf att] prints an unspecified representation of [att] on
9292+ [ppf]. *)
9393+9494+val pp_tag : Format.formatter -> tag -> unit
9595+(** [pp_tag ppf tag] prints an unspecified representation of [tag] on
9696+ [ppf]. *)
9797+9898+val pp_signal : Format.formatter -> signal -> unit
9999+(** [pp_signal ppf s] prints an unspecified representation of [s] on
100100+ [ppf]. *)
101101+102102+(** {1:api_input Input} *)
103103+104104+type pos = int * int
105105+(** The type for input positions. Line and column number, both start
106106+ with 1. *)
107107+108108+(** The type for input errors. *)
109109+type error = [
110110+ | `Max_buffer_size
111111+ (** Maximal buffer size exceeded ([Sys.max_string_length]). *)
112112+ | `Unexpected_eoi
113113+ (** Unexpected end of input. *)
114114+ | `Malformed_char_stream
115115+ (** Malformed underlying character stream. *)
116116+ | `Unknown_encoding of string
117117+ (** Unknown encoding. *)
118118+ | `Unknown_entity_ref of string
119119+ (** Unknown entity reference, {{!inentity} details}. *)
120120+ | `Unknown_ns_prefix of string
121121+ (** Unknown namespace prefix {{!inns} details} *)
122122+ | `Illegal_char_ref of string
123123+ (** Illegal character reference. *)
124124+ | `Illegal_char_seq of string
125125+ (** Illegal character sequence. *)
126126+ | `Expected_char_seqs of string list * string
127127+ (** Expected one of the character sequences in the list but found another. *)
128128+ | `Expected_root_element
129129+ (** Expected the document's root element. *) ]
130130+131131+val error_message : error -> string
132132+(** Converts the error to an english error message. *)
133133+134134+exception Error of pos * error
135135+(** Raised on input errors. *)
136136+137137+type source = [
138138+ | `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ]
139139+(** The type for input sources. For [`String] starts reading at the
140140+ given integer position. For [`Fun] the function must return the
141141+ next {e byte} as an [int] and raise [End_of_file] if there is no
142142+ such byte. *)
143143+144144+type input
145145+(** The type for input abstractions. *)
146146+147147+val make_input : ?enc:encoding option -> ?strip:bool ->
148148+ ?ns:(string -> string option) ->
149149+ ?entity: (string -> string option) -> source -> input
150150+(** Returns a new input abstraction reading from the given source.
151151+ {ul
152152+ {- [enc], character encoding of the document, {{!inenc} details}.
153153+ Defaults to [None].}
154154+ {- [strip], strips whitespace in character data, {{!inwspace} details}.
155155+ Defaults to [false].}
156156+ {- [ns] is called to bind undeclared namespace prefixes,
157157+ {{!inns} details}. Default returns always [None].}
158158+ {- [entity] is called to resolve non predefined entity references,
159159+ {{!inentity} details}. Default returns always [None].}} *)
160160+161161+val input : input -> signal
162162+(** Inputs a signal. Repeated invocation of the function with the same
163163+ input abstraction will generate a {{!signal}well-formed} sequence
164164+ of signals or an {!Error} is raised. Furthermore there will be no
165165+ two consecutive [`Data] signals in the sequence and their string
166166+ is always non empty.
167167+168168+ This behaviour is {b deprecated}: after a well-formed sequence was
169169+ input another may be input, see {!eoi} and {{!iseq}details}.
170170+171171+ {b Raises} {!Error} on input errors. *)
172172+173173+val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
174174+ input -> 'a
175175+(** If the next signal is a :
176176+ {ul
177177+ {- [`Data] signal, inputs it and invokes [data] with the character data.}
178178+ {- [`El_start] signal, inputs the sequence of signals until its
179179+ matching [`El_end] and invokes [el] and [data] as follows
180180+ {ul
181181+ {- [el], is called on each [`El_end] signals with the corresponding
182182+ [`El_start] tag and the result of the callback invocation for the
183183+ element's children.}
184184+ {- [data], is called on each [`Data] signals with the character data.
185185+ This function won't be called twice consecutively or with the empty
186186+ string.}}}
187187+ {- Other signals, raises [Invalid_argument].}}
188188+189189+ {b Raises} {!Error} on input errors and [Invalid_argument]
190190+ if the next signal is not [`El_start] or [`Data]. *)
191191+192192+val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
193193+ input -> (dtd * 'a)
194194+(** Same as {!input_tree} but reads a complete {{!signal}well-formed}
195195+ sequence of signals.
196196+197197+ {b Raises} {!Error} on input errors and [Invalid_argument]
198198+ if the next signal is not [`Dtd]. *)
199199+200200+val peek : input -> signal
201201+(** Same as {!val-input} but doesn't remove the signal from the sequence.
202202+203203+ {b Raises} {!Error} on input errors. *)
204204+205205+val eoi : input -> bool
206206+(** Returns [true] if the end of input is reached. See {{!iseq}details}.
207207+208208+ {b Raises} {!Error} on input errors. *)
209209+210210+val pos : input -> pos
211211+(** Current position in the input abstraction. *)
212212+213213+(** {1:api_output Output} *)
214214+215215+type 'a frag = [ `El of tag * 'a list | `Data of string ]
216216+(** The type for deconstructing data structures of type ['a]. *)
217217+218218+type dest = [ `Channel of out_channel | `Buffer of Buffer.t
219219+ | `Fun of (int -> unit) ]
220220+(** The type for output destinations. For [`Buffer], the buffer won't
221221+ be cleared. For [`Fun] the function is called with the output {e
222222+ bytes} as [int]s. *)
223223+224224+type output
225225+(** The type for output abstractions. *)
226226+227227+val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
228228+ ?ns_prefix:(string -> string option) -> dest -> output
229229+(** Returns a new output abstraction writing to the given destination.
230230+ {ul
231231+ {- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
232232+ declaration} is output (defaults to [true]).}
233233+ {- [nl], if [true] a newline is output when the root's element [`El_end]
234234+ signal is output.
235235+ Defaults to [false].}
236236+ {- [indent], identation behaviour, see {{!outindent} details}. Defaults to
237237+ [None].}
238238+ {- [ns_prefix], undeclared namespace prefix bindings,
239239+ see {{!outns}details}. Default returns always [None].}} *)
240240+241241+val output : output -> signal -> unit
242242+(** Outputs a signal.
243243+244244+ This behaviour is {b deprecated}: after a well-formed sequence of
245245+ signals was output a new well-formed sequence can be output.
246246+247247+ {b Raises} [Invalid_argument] if the resulting signal sequence on
248248+ the output abstraction is not {{!signal}well-formed} or if a
249249+ namespace name could not be bound to a prefix. *)
250250+251251+val output_depth : output -> int
252252+(** [output_depth o] is [o]'s current element nesting level (undefined
253253+ before the first [`El_start] and after the last [`El_end]). *)
254254+255255+val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
256256+(** Outputs signals corresponding to a value by recursively
257257+ applying the given value deconstructor.
258258+259259+ {b Raises} see {!val-output}. *)
260260+261261+val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
262262+(** Same as {!output_tree} but outputs a complete {{!signal}well-formed}
263263+ sequence of signals.
264264+265265+ {b Raises} see {!val-output}. *)
266266+267267+(** {1:sto Functorial interface (deprecated)}
268268+269269+ {b WARNING.} The functioral interface is deprecated and will be
270270+ removed.
271271+272272+ {!Make} allows client to specify types for strings and internal
273273+ buffers. Among other things this can be used to perform
274274+ hash-consing or to process the character stream, e.g. to normalize
275275+ unicode characters or to convert to a custom encoding. *)
276276+277277+type std_string = string
278278+type std_buffer = Buffer.t
279279+280280+(** Input signature for strings. *)
281281+module type String = sig
282282+283283+ type t
284284+ (** The type for strings. *)
285285+286286+ val empty : t
287287+ (** The empty string. *)
288288+289289+ val length : t -> int
290290+ (** Returns the length of the string. *)
291291+292292+ val append : t -> t -> t
293293+ (** Concatenates two strings. *)
294294+295295+ val lowercase : t -> t
296296+ (** New string with uppercase letter translated
297297+ to lowercase (correctness is only needed for ASCII
298298+ {{:http://www.unicode.org/glossary/#code_point}code point}). *)
299299+300300+ val iter : (int -> unit) -> t -> unit
301301+ (** Iterates over the unicode
302302+ {{:http://www.unicode.org/glossary/#code_point}code point}
303303+ of the given string. *)
304304+305305+ val of_string : std_string -> t
306306+ (** String from an OCaml string. *)
307307+308308+ val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a
309309+ (** [to_utf_8 f v s], is [f (... (f (f v s1) s2) ...) sn]. Where the
310310+ concatenation of [s1], [s2], ... [sn] is [s] as an UTF-8 stream. *)
311311+312312+ val compare : t -> t -> int
313313+ (** String comparison. Binary comparison is sufficent. *)
314314+end
315315+316316+(** Input signature for internal buffers. *)
317317+module type Buffer = sig
318318+319319+ type string
320320+ (** The type for strings. *)
321321+322322+ type t
323323+ (** The type for buffers. *)
324324+325325+ exception Full
326326+ (** Raised if the buffer cannot be grown. *)
327327+328328+ val create : int -> t
329329+ (** Creates a buffer of the given size. *)
330330+331331+ val add_uchar : t -> int -> unit
332332+ (** Adds the given (guaranteed valid) unicode
333333+ {{:http://www.unicode.org/glossary/#code_point}code point} to a
334334+ buffer.
335335+336336+ {b Raises} {!Full} if the buffer cannot be grown. *)
337337+338338+ val clear : t -> unit
339339+ (** Clears the buffer. *)
340340+341341+ val contents : t -> string
342342+ (** Returns the buffer contents. *)
343343+344344+ val length : t -> int
345345+ (** Returns the number of characters contained in the buffer. *)
346346+end
347347+348348+(** Output signature of {!Make}. *)
349349+module type S = sig
350350+351351+ (** {1 Basic types and values} *)
352352+353353+ type string
354354+355355+ type encoding = [
356356+ | `UTF_8
357357+ | `UTF_16
358358+ | `UTF_16BE
359359+ | `UTF_16LE
360360+ | `ISO_8859_1
361361+ | `ISO_8859_15
362362+ | `US_ASCII ]
363363+364364+ type dtd = string option
365365+ type name = string * string
366366+ type attribute = name * string
367367+ type tag = name * attribute list
368368+ type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
369369+370370+ val ns_xml : string
371371+ val ns_xmlns : string
372372+373373+ (** {1 Input} *)
374374+375375+ type pos = int * int
376376+ type error = [
377377+ | `Max_buffer_size
378378+ | `Unexpected_eoi
379379+ | `Malformed_char_stream
380380+ | `Unknown_encoding of string
381381+ | `Unknown_entity_ref of string
382382+ | `Unknown_ns_prefix of string
383383+ | `Illegal_char_ref of string
384384+ | `Illegal_char_seq of string
385385+ | `Expected_char_seqs of string list * string
386386+ | `Expected_root_element ]
387387+388388+ exception Error of pos * error
389389+ val error_message : error -> string
390390+391391+ type source = [
392392+ | `Channel of in_channel
393393+ | `String of int * std_string
394394+ | `Fun of (unit -> int) ]
395395+396396+ type input
397397+398398+ val make_input : ?enc:encoding option -> ?strip:bool ->
399399+ ?ns:(string -> string option) ->
400400+ ?entity: (string -> string option) -> source -> input
401401+402402+ val input : input -> signal
403403+404404+ val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
405405+ input -> 'a
406406+407407+ val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
408408+ input -> (dtd * 'a)
409409+410410+ val peek : input -> signal
411411+ val eoi : input -> bool
412412+ val pos : input -> pos
413413+414414+ (** {1 Output} *)
415415+416416+ type 'a frag = [ `El of tag * 'a list | `Data of string ]
417417+ type dest = [
418418+ | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
419419+420420+ type output
421421+ val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
422422+ ?ns_prefix:(string -> string option) -> dest -> output
423423+424424+ val output_depth : output -> int
425425+ val output : output -> signal -> unit
426426+ val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
427427+ val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
428428+end
429429+430430+(** Functor building streaming XML IO with the given strings and buffers. *)
431431+module Make (String : String) (Buffer : Buffer with type string = String.t) : S
432432+ with type string = String.t
433433+434434+(** {1:io Features and limitations}
435435+436436+ The module assumes strings are immutable, thus strings
437437+ the client gives or receives {e during} the input and output process
438438+ must not be modified.
439439+ {2:input Input}
440440+ {3:inenc Encoding}
441441+442442+ The parser supports ASCII, US-ASCII,
443443+ {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8},
444444+ {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16},
445445+ {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE},
446446+ {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and
447447+ {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1}
448448+ (Latin-1) encoded documents. But strings returned by
449449+ the library are {b always} UTF-8 encoded.
450450+451451+ The encoding can be specified explicitly using the optional
452452+ argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a
453453+ {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the
454454+ beginning of the document. If there is no BOM it uses the encoding
455455+ specified in the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
456456+ declaration}. Finally, if there is no XML declaration UTF-8 is assumed.
457457+ {3:inwspace White space handling}
458458+459459+ The parser performs
460460+ {{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data
461461+ normalization} on {e every} attribute data. This means that
462462+ attribute data does not have leading and trailling white space and that
463463+ any white space is collapsed and transformed to a single space
464464+ character ([U+0020]).
465465+466466+ White space handling of character data depends on the [strip]
467467+ argument. If [strip] is [true], character data is treated like
468468+ attribute data, white space before and after elements is removed
469469+ and any white space is collapsed and transformed to a single
470470+ space character ([U+0020]), except if the data is under the scope of a {e
471471+ xml:space} attribute whose value is {e preserve}. If [strip] is
472472+ [false] all white space data is preserved as present in the
473473+ document (however all kinds of
474474+ {{:http://www.w3.org/TR/REC-xml/#sec-line-ends}line ends} are
475475+ translated to the newline character ([U+000A]).
476476+477477+ {3:inns Namespaces}
478478+479479+ Xmlm's {{!name}names} are
480480+ {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}.
481481+ The parser automatically handles the document's namespace
482482+ declarations. Undeclared namespace prefixes can be bound via the
483483+ callback [ns], which must return a namespace name. If [ns] returns
484484+ [None] an [`Unknown_ns_prefix] error is raised.
485485+486486+ Attributes used for namespace declarations are preserved by the
487487+ parser. They are in the {!ns_xmlns} namespace. Default namespace
488488+ declarations made with {i xmlns} have the attribute name
489489+ [(Xmlm.ns_xmlns, "xmlns")]. Prefix declarations have the prefix as
490490+ the local name, for example {i xmlns:ex} results in the attribute name
491491+ [(Xmlm.ns_xmlns, "ex")].
492492+493493+ Regarding constraints on the usage of the {i xml} and {i xmlns}
494494+ prefixes by documents, the parser does not report errors on violations
495495+ of the {i must} constraints listed in
496496+ {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}.
497497+498498+ {3:inentity Character and entity references}
499499+500500+ {{:http://www.w3.org/TR/REC-xml/#dt-charref}Character references}
501501+ and {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined
502502+ entities} are automatically resolved. Other entity references can
503503+ be resolved by the callback [entity], which must return an UTF-8
504504+ string corresponding to the
505505+ replacement character data. The replacement data is {e not}
506506+ analysed for further references, it is added to the data as such
507507+ modulo white space stripping. If [entity] returns [None] the error
508508+ [`Unknown_entity_ref] is returned.
509509+510510+ {3:iseq Sequences of documents (deprecated)}
511511+512512+ {b WARNING.} This feature is deprecated and will be removed.
513513+514514+ When a well-formed sequence of signals is input, no data is consumed beyond
515515+ the closing ['>'] of the document's root element.
516516+517517+ If you want to parse a document as
518518+ {{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML
519519+ specification, call {!eoi} after a well-formed sequence of
520520+ signals, it must return [true]. If you expect another document on
521521+ the same input abstraction a new well-formed sequence of signals
522522+ can be {!val-input}. Use {!eoi} to check if a document follows (this
523523+ may consume data).
524524+525525+ Invoking {!eoi} after a well-formed sequence of signals skips
526526+ whitespaces, comments and processing instructions until it gets to
527527+ either an {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
528528+ declaration} or a {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}
529529+ or the start of a new element or the end of input (in which case
530530+ {!eoi} returns [true]). If there is a new document but there is no
531531+ XML declaration or the declaration specifies UTF-16, the same
532532+ encoding as for the previous document is used.
533533+534534+ {3:inmisc Miscellaneous}
535535+ {ul
536536+ {- Parses the more liberal and simpler XML 1.1
537537+ {{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because
538538+ of namespaces).}
539539+ {- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed
540540+ roughly (no guarantee it is well formed) and its information is ignored.}
541541+ {- The parser drops
542542+ {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments},
543543+ {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and
544544+ {{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.}
545545+ {- Element attributes are not checked for uniqueness.}
546546+ {- Attribute and character data chunks are limited by
547547+ [Sys.max_string_length].
548548+ The error [`Max_buffer_size] is raised if the limit is hit.}
549549+ {- Tail recursive.}
550550+ {- Non validating.}
551551+ }
552552+553553+ {2:output Output}
554554+ {3:outenc Encoding}
555555+556556+ Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}
557557+ encoded documents. Strings given to
558558+ output functions {b must be} UTF-8 encoded, no checks are
559559+ performed. Unicode characters that are not legal XML
560560+ {{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced
561561+ by the {{:http://unicode.org/glossary/#replacement_character}Unicode
562562+ replacement character}.
563563+564564+ {3:outns Namespaces}
565565+566566+ Xmlm's {{:#TYPEname}names} are
567567+ {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}.
568568+ Expanded names are automatically converted to
569569+ {{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified
570570+ names} by the output abstraction. There is no particular api to specify
571571+ prefixes and default namespaces,
572572+ the actual result depends solely on the output
573573+ of attributes belonging to the {!ns_xmlns} namespace. For example to set
574574+ the default namespace of an element to {i http://example.org/myns},
575575+ use the following attribute :
576576+ {[(* xmlns='http://example.org/myns' *)
577577+let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]}
578578+ To bind the prefix ["ex"] to {i http://example.org/ex}, use the
579579+ following attribute :
580580+ {[(* xmlns:ex='http://example.org/ex' *)
581581+let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
582582+ Note that outputing input signals without
583583+ touching namespace declaration attributes will preserve existing
584584+ prefixes and bindings provided the same namespace name is not
585585+ bound to different prefixes in a given context.
586586+587587+ The callback [ns_prefix] of an output abstraction can be used to
588588+ give a prefix to a namespace name lacking a prefix binding in the
589589+ current output scope. Given a namespace name the function must return
590590+ the prefix to use. Note that this
591591+ will {b not} add any namespace declaration attribute to the
592592+ output. If the function returns [None], {!val-output} will raise
593593+ [Invalid_argument]. The default function returns always [None].
594594+ {3:outindent Indentation}
595595+596596+ Output can be indented by specifying the [indent] argument when an
597597+ output abstraction is created. If [indent] is [None] (default)
598598+ signal output does not introduce any extra white space. If
599599+ [ident] is [Some c], each {!signal} is output on its own line
600600+ (for empty elements [`El_start] and [`El_end] are collapsed on a single
601601+ line) and nested elements are indented with [c] space
602602+ characters.
603603+604604+ {3:oseq Sequences of documents (deprecated)}
605605+606606+ {b WARNING.} This feature is deprecated and will be removed.
607607+608608+ After a well-formed sequence of signals was output, the output
609609+ abstraction can be reused to output a new well-formed sequence of
610610+ signals.
611611+612612+ {3:outmisc Miscellaneous}
613613+ {ul
614614+ {- Output on a channel does not flush it.}
615615+ {- In attribute and character data you provide, markup
616616+ delimiters ['<'],['>'],['&'], and ['\"'] are
617617+ automatically escaped to
618618+ {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined
619619+ entities}.}
620620+ {- No checks are peformed on the prefix and local part of output
621621+ names to verify they are
622622+ {{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s.
623623+ For example using the tag name [("","dip d")] will produce
624624+ a non well-formed document because of the space character.}
625625+ {- Tail recursive.}}
626626+627627+ {2 Tips}
628628+ {ul
629629+ {- The best options to do an input/output round trip
630630+ and preserve as much information as possible is to
631631+ input with [strip = false] and output with [indent = None].}
632632+ {- Complete whitespace control on output is achieved
633633+ with [indent = None] and suitable [`Data] signals}}
634634+635635+ {1:ex Examples}
636636+637637+ {2:exseq Sequential processing}
638638+639639+ Sequential processing has the advantage that you don't need to get
640640+ the whole document tree in memory to process it.
641641+642642+ The following function reads a {e single} document on an
643643+ input channel and outputs it.
644644+{[let id ic oc =
645645+ let i = Xmlm.make_input (`Channel ic) in
646646+ let o = Xmlm.make_output (`Channel oc) in
647647+ let rec pull i o depth =
648648+ Xmlm.output o (Xmlm.peek i);
649649+ match Xmlm.input i with
650650+ | `El_start _ -> pull i o (depth + 1)
651651+ | `El_end -> if depth = 1 then () else pull i o (depth - 1)
652652+ | `Data _ -> pull i o depth
653653+ | `Dtd _ -> assert false
654654+ in
655655+ Xmlm.output o (Xmlm.input i); (* `Dtd *)
656656+ pull i o 0;
657657+ if not (Xmlm.eoi i) then invalid_arg "document not well-formed"]}
658658+659659+ The following function reads a {e sequence} of documents on an
660660+ input channel and outputs it.
661661+{[let id_seq ic oc =
662662+ let i = Xmlm.make_input (`Channel ic) in
663663+ let o = Xmlm.make_output ~nl:true (`Channel oc) in
664664+ while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]}
665665+ The following function reads a {e sequence} of documents on the
666666+ input channel. In each document's tree it prunes non root elements
667667+ whose name belongs to [prune_list].
668668+{[let prune_docs prune_list ic oc =
669669+ let i = Xmlm.make_input (`Channel ic) in
670670+ let o = Xmlm.make_output ~nl:true (`Channel oc) in
671671+ let copy i o = Xmlm.output o (Xmlm.input i) in
672672+ let prune (name, _) = List.mem name prune_list in
673673+ let rec process i o d =
674674+ let rec skip i d = match Xmlm.input i with
675675+ | `El_start _ -> skip i (d + 1)
676676+ | `El_end -> if d = 1 then () else skip i (d - 1)
677677+ | s -> skip i d
678678+ in
679679+ match Xmlm.peek i with
680680+ | `El_start tag when prune tag -> skip i 0; process i o d
681681+ | `El_start _ -> copy i o; process i o (d + 1)
682682+ | `El_end -> copy i o; if d = 0 then () else process i o (d - 1)
683683+ | `Data _ -> copy i o; process i o d
684684+ | `Dtd _ -> assert false
685685+ in
686686+ let rec docs i o =
687687+ copy i o; (* `Dtd *)
688688+ copy i o; (* root start *)
689689+ process i o 0;
690690+ if Xmlm.eoi i then () else docs i o
691691+ in
692692+ docs i o]}
693693+694694+ {2:extree Tree processing}
695695+696696+ A document's sequence of signals can be easily converted
697697+ to an arborescent data structure. Assume your trees are defined by :
698698+ {[type tree = E of Xmlm.tag * tree list | D of string]}
699699+ The following functions input/output xml documents from/to abstractions
700700+ as value of type [tree].
701701+{[let in_tree i =
702702+ let el tag childs = E (tag, childs) in
703703+ let data d = D d in
704704+ Xmlm.input_doc_tree ~el ~data i
705705+706706+let out_tree o t =
707707+ let frag = function
708708+ | E (tag, childs) -> `El (tag, childs)
709709+ | D d -> `Data d
710710+ in
711711+ Xmlm.output_doc_tree frag o t]}
712712+713713+ {2:exrow Tabular data processing}
714714+715715+ We show how to process XML data that represents tabular data (some
716716+ people like do that).
717717+718718+ The file we need to deal with represents nominal data about
719719+ {{:http://www.w3.org/}W3C bureaucrats}. There are no namespaces
720720+ and attributes are ignored. The element structure of the document
721721+ is :
722722+ {ul {- <list>
723723+ {ul {- <bureaucrat> represents a W3C bureaucrat
724724+ (zero or more).
725725+726726+ A bureaucrat contains the following elements, in order.
727727+ {ul {- <name> its name (mandatory, string).}
728728+ {- <surname> its surname (mandatory, string).}
729729+ {- <honest> present iff he implemented one of its spec
730730+ (optional, empty).}
731731+ {- <obfuscation_level> its grade on the
732732+ open scale of obfuscation (mandatory, float).}
733733+ {- <tr> (zero or more, string), technical reports he
734734+ worked on.}}}}}}
735735+736736+ In OCaml we represent a W3C bureaucrat by this type :
737737+{[type w3c_bureaucrat = {
738738+ name : string;
739739+ surname : string;
740740+ honest : bool;
741741+ obfuscation_level : float;
742742+ trs : string list; }]}
743743+ The following functions input and output W3C bureaucrats as lists
744744+ of values of type [w3c_bureaucrat].
745745+{[let in_w3c_bureaucrats src =
746746+ let i = Xmlm.make_input ~strip:true src in
747747+ let tag n = ("", n), [] in
748748+ let error () = invalid_arg "parse error" in
749749+ let accept s i = if Xmlm.input i = s then () else error () in
750750+ let rec i_seq el acc i = match Xmlm.peek i with
751751+ | `El_start _ -> i_seq el ((el i) :: acc) i
752752+ | `El_end -> List.rev acc
753753+ | _ -> error ()
754754+ in
755755+ let i_el n i =
756756+ accept (`El_start (tag n)) i;
757757+ let d = match Xmlm.peek i with
758758+ | `Data d -> ignore (Xmlm.input i); d
759759+ | `El_end -> ""
760760+ | _ -> error ()
761761+ in
762762+ accept (`El_end) i;
763763+ d
764764+ in
765765+ let i_bureaucrat i =
766766+ try
767767+ accept (`El_start (tag "bureaucrat")) i;
768768+ let name = i_el "name" i in
769769+ let surname = i_el "surname" i in
770770+ let honest = match Xmlm.peek i with
771771+ | `El_start (("", "honest"), []) -> ignore (i_el "honest" i); true
772772+ | _ -> false
773773+ in
774774+ let obf = float_of_string (i_el "obfuscation_level" i) in
775775+ let trs = i_seq (i_el "tr") [] i in
776776+ accept (`El_end) i;
777777+ { name = name; surname = surname; honest = honest;
778778+ obfuscation_level = obf; trs = trs }
779779+ with
780780+ | Failure _ -> error () (* float_of_string *)
781781+ in
782782+ accept (`Dtd None) i;
783783+ accept (`El_start (tag "list")) i;
784784+ let bl = i_seq i_bureaucrat [] i in
785785+ accept (`El_end) i;
786786+ if not (Xmlm.eoi i) then invalid_arg "more than one document";
787787+ bl
788788+789789+let out_w3c_bureaucrats dst bl =
790790+ let tag n = ("", n), [] in
791791+ let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
792792+ let out = Xmlm.output o in
793793+ let o_el n d =
794794+ out (`El_start (tag n));
795795+ if d <> "" then out (`Data d);
796796+ out `El_end
797797+ in
798798+ let o_bureaucrat b =
799799+ out (`El_start (tag "bureaucrat"));
800800+ o_el "name" b.name;
801801+ o_el "surname" b.surname;
802802+ if b.honest then o_el "honest" "";
803803+ o_el "obfuscation_level" (string_of_float b.obfuscation_level);
804804+ List.iter (o_el "tr") b.trs;
805805+ out `El_end
806806+ in
807807+ out (`Dtd None);
808808+ out (`El_start (tag "list"));
809809+ List.iter o_bureaucrat bl;
810810+ out (`El_end)]}
811811+*)
812812+813813+(*---------------------------------------------------------------------------
814814+ Copyright (c) 2007 The xmlm programmers
815815+816816+ Permission to use, copy, modify, and/or distribute this software for any
817817+ purpose with or without fee is hereby granted, provided that the above
818818+ copyright notice and this permission notice appear in all copies.
819819+820820+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
821821+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
822822+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
823823+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
824824+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
825825+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
826826+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
827827+ ---------------------------------------------------------------------------*)
···11+(* Examples from the documentation, this code is in public domain. *)
22+33+(* Sequential processing *)
44+55+let id ic oc =
66+ let i = Xmlm.make_input (`Channel ic) in
77+ let o = Xmlm.make_output (`Channel oc) in
88+ let rec pull i o depth =
99+ Xmlm.output o (Xmlm.peek i);
1010+ match Xmlm.input i with
1111+ | `El_start _ -> pull i o (depth + 1)
1212+ | `El_end -> if depth = 1 then () else pull i o (depth - 1)
1313+ | `Data _ -> pull i o depth
1414+ | `Dtd _ -> assert false
1515+ in
1616+ Xmlm.output o (Xmlm.input i); (* `Dtd *)
1717+ pull i o 0;
1818+ if not (Xmlm.eoi i) then invalid_arg "document not well-formed"
1919+2020+let id_seq ic oc =
2121+ let i = Xmlm.make_input (`Channel ic) in
2222+ let o = Xmlm.make_output ~nl:true (`Channel oc) in
2323+ while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done
2424+2525+let prune_docs prune_list ic oc =
2626+ let i = Xmlm.make_input (`Channel ic) in
2727+ let o = Xmlm.make_output ~nl:true (`Channel oc) in
2828+ let copy i o = Xmlm.output o (Xmlm.input i) in
2929+ let prune (name, _) = List.mem name prune_list in
3030+ let rec process i o d =
3131+ let rec skip i d = match Xmlm.input i with
3232+ | `El_start _ -> skip i (d + 1)
3333+ | `El_end -> if d = 1 then () else skip i (d - 1)
3434+ | s -> skip i d
3535+ in
3636+ match Xmlm.peek i with
3737+ | `El_start tag when prune tag -> skip i 0; process i o d
3838+ | `El_start _ -> copy i o; process i o (d + 1)
3939+ | `El_end -> copy i o; if d = 0 then () else process i o (d - 1)
4040+ | `Data _ -> copy i o; process i o d
4141+ | `Dtd _ -> assert false
4242+ in
4343+ let rec docs i o =
4444+ copy i o; (* `Dtd *)
4545+ copy i o; (* root start *)
4646+ process i o 0;
4747+ if Xmlm.eoi i then () else docs i o
4848+ in
4949+ docs i o
5050+5151+(* Tree processing *)
5252+5353+type tree = E of Xmlm.tag * tree list | D of string
5454+5555+let in_tree i =
5656+ let el tag childs = E (tag, childs) in
5757+ let data d = D d in
5858+ Xmlm.input_doc_tree ~el ~data i
5959+6060+let out_tree o t =
6161+ let frag = function
6262+ | E (tag, childs) -> `El (tag, childs)
6363+ | D d -> `Data d
6464+ in
6565+ Xmlm.output_doc_tree frag o t
6666+6767+(* Tabular data processing. *)
6868+6969+type w3c_bureaucrat =
7070+ { name : string;
7171+ surname : string;
7272+ honest : bool;
7373+ obfuscation_level : float;
7474+ trs : string list; }
7575+7676+let in_w3c_bureaucrats src =
7777+ let i = Xmlm.make_input ~strip:true src in
7878+ let tag n = ("", n), [] in
7979+ let error () = invalid_arg "parse error" in
8080+ let accept s i = if Xmlm.input i = s then () else error () in
8181+ let rec i_seq el acc i = match Xmlm.peek i with
8282+ | `El_start _ -> i_seq el ((el i) :: acc) i
8383+ | `El_end -> List.rev acc
8484+ | _ -> error ()
8585+ in
8686+ let i_el n i =
8787+ accept (`El_start (tag n)) i;
8888+ let d = match Xmlm.peek i with
8989+ | `Data d -> ignore (Xmlm.input i); d
9090+ | `El_end -> ""
9191+ | _ -> error ()
9292+ in
9393+ accept (`El_end) i;
9494+ d
9595+ in
9696+ let i_bureaucrat i =
9797+ try
9898+ accept (`El_start (tag "bureaucrat")) i;
9999+ let name = i_el "name" i in
100100+ let surname = i_el "surname" i in
101101+ let honest = match Xmlm.peek i with
102102+ | `El_start (("", "honest"), []) -> ignore (i_el "honest" i); true
103103+ | _ -> false
104104+ in
105105+ let obf = float_of_string (i_el "obfuscation_level" i) in
106106+ let trs = i_seq (i_el "tr") [] i in
107107+ accept (`El_end) i;
108108+ { name = name; surname = surname; honest = honest;
109109+ obfuscation_level = obf; trs = trs }
110110+ with
111111+ | Failure _ -> error () (* float_of_string *)
112112+ in
113113+ accept (`Dtd None) i;
114114+ accept (`El_start (tag "list")) i;
115115+ let bl = i_seq i_bureaucrat [] i in
116116+ accept (`El_end) i;
117117+ if not (Xmlm.eoi i) then invalid_arg "more than one document";
118118+ bl
119119+120120+let out_w3c_bureaucrats dst bl =
121121+ let tag n = ("", n), [] in
122122+ let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
123123+ let out = Xmlm.output o in
124124+ let o_el n d =
125125+ out (`El_start (tag n));
126126+ if d <> "" then out (`Data d);
127127+ out `El_end
128128+ in
129129+ let o_bureaucrat b =
130130+ out (`El_start (tag "bureaucrat"));
131131+ o_el "name" b.name;
132132+ o_el "surname" b.surname;
133133+ if b.honest then o_el "honest" "";
134134+ o_el "obfuscation_level" (string_of_float b.obfuscation_level);
135135+ List.iter (o_el "tr") b.trs;
136136+ out `El_end
137137+ in
138138+ out (`Dtd None);
139139+ out (`El_start (tag "list"));
140140+ List.iter o_bureaucrat bl;
141141+ out (`El_end)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2014 The xmlm programmers. All rights reserved.
33+ Distributed under the ISC license, see terms at the end of the file.
44+ ---------------------------------------------------------------------------*)
55+66+let str = Format.sprintf
77+let log f = Format.printf (f ^^ "@?")
88+let fail fmt =
99+ let fail _ = failwith (Format.flush_str_formatter ()) in
1010+ Format.kfprintf fail Format.str_formatter fmt
1111+1212+(* We should add mode more coverage here see e.g. what is done in jsonm. *)
1313+1414+let test_decode fnd exp =
1515+ if fnd <> exp
1616+ then fail "found: %a expected: %a" Xmlm.pp_signal fnd Xmlm.pp_signal exp
1717+1818+let test_seq ?enc ?strip ?ns ?entity ?dtd src seq =
1919+ let d = Xmlm.make_input ?enc ?strip ?ns ?entity (`String (0, src)) in
2020+ let rec loop d = function [] -> ()
2121+ | v :: vs -> test_decode (Xmlm.input d) v; loop d vs
2222+ in
2323+ try
2424+ let seq = match dtd with None -> `Dtd None :: seq | Some d -> d :: seq in
2525+ loop d seq;
2626+ if not (Xmlm.eoi d) then fail "Expected end of input"
2727+ with Xmlm.Error ((l,c), e) ->
2828+ fail "error:%d:%d: %s" l c (Xmlm.error_message e)
2929+3030+let name ?(ns = "") n = (ns, n)
3131+let att ?ns n v = name ?ns n, v
3232+let tag ?(atts = []) ?ns n = (name ?ns n), atts
3333+let el ?atts ?ns n content =
3434+ (`El_start (tag ?atts ?ns n)) :: List.flatten content @ [`El_end]
3535+3636+let decoder_strip_atts () =
3737+ log "Decoder attribute stripping.\n";
3838+ let test_attv v pv =
3939+ test_seq (str "<e a ='%s'></e>" v) (el "e" ~atts:[att "a" pv] [])
4040+ in
4141+ test_attv " bla bli\n\n blo " "bla bli blo";
4242+ let test_iso_8859_15 v pv =
4343+ test_seq ~enc:(Some `ISO_8859_15) (str "<e>%s</e>" v) (el "e" [[`Data pv]])
4444+ in
4545+ List.iter
4646+ (fun (v, pv) -> test_iso_8859_15 v pv)
4747+ [
4848+ ("\065", "\u{0041}"); (* A *)
4949+ ("\164", "\u{20AC}"); (* € *)
5050+ ("\166", "\u{0160}"); (* Š *)
5151+ ("\168", "\u{0161}"); (* š *)
5252+ ("\180", "\u{017D}"); (* Ž *)
5353+ ("\184", "\u{017E}"); (* ž *)
5454+ ("\188", "\u{0152}"); (* Œ *)
5555+ ("\189", "\u{0153}"); (* œ *)
5656+ ("\190", "\u{0178}"); (* Ÿ *)
5757+ ];
5858+ ()
5959+6060+let test () =
6161+ Printexc.record_backtrace true;
6262+ decoder_strip_atts ();
6363+ log "All tests succeeded.\n"
6464+6565+let () = if not (!Sys.interactive) then test ()
6666+6767+(*---------------------------------------------------------------------------
6868+ Copyright (c) 2014 The xmlm programmers
6969+7070+ Permission to use, copy, modify, and/or distribute this software for any
7171+ purpose with or without fee is hereby granted, provided that the above
7272+ copyright notice and this permission notice appear in all copies.
7373+7474+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
7575+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
7676+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
7777+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
7878+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
7979+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
8080+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
8181+ ---------------------------------------------------------------------------*)
+11
vendor/opam/xmlm/test/test_tree.ml
···11+let li d = `El ((("", "li"), []), [`Data d])
22+let frag = `El ((("", "ol"), []), [li "bli"; li "bla"; li "blo"])
33+44+let main () =
55+ let b = Buffer.create 233 in
66+ let o = Xmlm.make_output (`Buffer b) in
77+ Xmlm.output o (`Dtd None);
88+ Xmlm.output_tree (fun x -> x) o frag;
99+ print_endline (Buffer.contents b)
1010+1111+let () = main ()
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2007 The xmlm programmers. All rights reserved.
33+ Distributed under the ISC license, see terms at the end of the file.
44+ ---------------------------------------------------------------------------*)
55+66+let str = Printf.sprintf
77+let exec = Filename.basename Sys.executable_name
88+let pr_err s = Printf.eprintf "%s:%s\n" exec s
99+let apply f x ~finally y =
1010+ let result = try f x with exn -> finally y; raise exn in
1111+ finally y;
1212+ result
1313+1414+let fail ((l, c), e) = failwith (str "%d:%d: %s" l c (Xmlm.error_message e))
1515+1616+type tree = E of Xmlm.tag * tree list | D of string
1717+1818+let in_tree i =
1919+ let el tag childs = E (tag, childs) in
2020+ let data d = D d in
2121+ Xmlm.input_doc_tree ~el ~data i
2222+2323+let out_tree o t =
2424+ let frag = function
2525+ | E (tag, childs) -> `El (tag, childs)
2626+ | D d -> `Data d
2727+ in
2828+ Xmlm.output_doc_tree frag o t
2929+3030+let xml_parse tree enc strip entity ns ic () = (* parse only *)
3131+ let i = Xmlm.make_input ~enc ~strip ~entity ~ns (`Channel ic) in
3232+ let doc i =
3333+ if tree then ignore (in_tree i) else
3434+ begin
3535+ let rec pull i l = match Xmlm.input i with
3636+ | `El_start _ -> pull i (l + 1)
3737+ | `El_end -> if l = 1 then () else pull i (l - 1)
3838+ | `Data _ -> pull i l
3939+ | `Dtd _ -> assert false
4040+ in
4141+ ignore (Xmlm.input i); (* `Dtd *)
4242+ pull i 0;
4343+ end
4444+ in
4545+ try while not (Xmlm.eoi i) do doc i done
4646+ with Xmlm.Error (p, e) -> fail (p, e)
4747+4848+let xml_signals _ enc strip entity ns ic _ = (* output signals *)
4949+ let i = Xmlm.make_input ~enc ~strip ~entity ~ns (`Channel ic) in
5050+ let pp_signal s = Format.printf "@[%a@]@," Xmlm.pp_signal s in
5151+ try
5252+ Format.printf "@[<v>";
5353+ while not (Xmlm.eoi i) do pp_signal (Xmlm.input i); done;
5454+ Format.printf "@]";
5555+ with Xmlm.Error (p, e) -> fail (p, e)
5656+5757+let xml_outline tree enc strip entity ns ic oc = (* ascii outline *)
5858+ let pr s = Printf.fprintf oc s in
5959+ let pr_dtd dtd = match dtd with Some s -> pr "+-DTD %S\n" s | _ -> () in
6060+ let pr_depth d = for k = 1 to d do pr "| " done in
6161+ let pr_data d data = pr_depth d; pr "%S\n" data in
6262+ let pr_name c (p, l) = if p <> "" then pr "%s:%s" p l else pr "%s" l in
6363+ let pr_att d (n, v) = pr_depth (d + 1); pr "* %a = %S\n" pr_name n v in
6464+ let pr_tag d (n, atts) =
6565+ pr_depth d; pr "+-%a\n" pr_name n; List.iter (pr_att d) atts
6666+ in
6767+ let i = Xmlm.make_input ~enc ~strip ~entity ~ns (`Channel ic) in
6868+ let doc i =
6969+ if tree then
7070+ begin
7171+ let rec pr_tree d = function
7272+ | (n :: next) :: path ->
7373+ begin match n with
7474+ | D data -> pr_data d data; pr_tree d (next :: path)
7575+ | E (tag, childs) ->
7676+ pr_tag d tag; pr_tree (d+1) (childs :: next :: path)
7777+ end
7878+ | [] :: path -> if d = 0 then () else pr_tree (d - 1) path
7979+ | _ -> assert false
8080+ in
8181+ let dtd, t = in_tree i in
8282+ pr_dtd dtd;
8383+ pr_tree 0 ([t] :: [])
8484+ end
8585+ else
8686+ begin
8787+ let rec pull i l = match Xmlm.input i with
8888+ | `El_start tag -> pr_tag l tag; pull i (l + 1)
8989+ | `El_end -> if l = 1 then () else pull i (l - 1)
9090+ | `Data d -> pr_data l d; pull i l
9191+ | `Dtd _ -> assert false
9292+ in
9393+ pr_dtd (match Xmlm.input i with `Dtd d -> d | _ -> assert false);
9494+ pull i 0;
9595+ end;
9696+ flush oc
9797+ in
9898+ try while not (Xmlm.eoi i) do doc i done
9999+ with Xmlm.Error (p, e) -> fail (p, e)
100100+101101+let xml_xml indent tree enc strip entity ns ic oc = (* xml trip *)
102102+ let nl = (indent = None) in
103103+ let i = Xmlm.make_input ~enc ~strip ~ns ~entity (`Channel ic) in
104104+ let o = Xmlm.make_output ~nl ~indent ~ns_prefix:ns (`Channel oc) in
105105+ let doc i o =
106106+ if tree then (out_tree o (in_tree i)) else
107107+ begin
108108+ let rec pull i o depth =
109109+ let s = Xmlm.input i in
110110+ Xmlm.output o s;
111111+ match s with
112112+ | `El_start _ -> pull i o (depth + 1)
113113+ | `El_end -> if depth = 1 then () else pull i o (depth - 1)
114114+ | `Data _ -> pull i o depth
115115+ | `Dtd _ -> assert false
116116+ in
117117+ Xmlm.output o (Xmlm.input i); (* `Dtd *)
118118+ pull i o 0
119119+ end
120120+ in
121121+ try while not (Xmlm.eoi i) do doc i o done
122122+ with Xmlm.Error (p, e) -> fail (p, e)
123123+124124+let with_inf f inf v =
125125+ try
126126+ let ic = if inf <> "" then open_in_bin inf else stdin in
127127+ let close ic = if inf <> "" then close_in ic else () in
128128+ apply (f ic) v ~finally:close ic
129129+ with
130130+ | Sys_error e -> pr_err (str " %s" e)
131131+ | Failure e -> pr_err (str "%s:%s" inf e)
132132+133133+let with_outf f ic outf =
134134+ try
135135+ let oc = if outf <> "" then open_out_bin outf else stdout in
136136+ let close oc = if outf <> "" then close_out oc else () in
137137+ apply (f ic) oc ~finally:close oc
138138+ with
139139+ | Sys_error e -> pr_err (str " %s" e)
140140+141141+let entity_fun eref xhtml =
142142+ if not xhtml then (if eref then fun x -> Some x else fun x -> None) else
143143+ let h = Hashtbl.create 270 in
144144+ List.iter (fun (e, ustr) -> Hashtbl.add h e ustr) Xhtml.entities;
145145+ if eref then (fun x -> try Some (Hashtbl.find h x) with Not_found -> Some x)
146146+ else (fun x -> try Some (Hashtbl.find h x) with Not_found -> None)
147147+148148+let process signals tree enc strip eref ns xhtml parse_only outline indent
149149+ suffix files =
150150+ let entity = entity_fun eref xhtml in
151151+ let ns = if ns then fun x -> Some x else fun x -> None in
152152+ let f =
153153+ if parse_only then
154154+ fun inf -> with_inf (xml_parse tree enc strip entity ns) inf ()
155155+ else
156156+ let outf inf =
157157+ if inf = "" || suffix = "" then "" (* stdout *) else
158158+ str "%s.%s" inf suffix
159159+ in
160160+ let f =
161161+ if outline then xml_outline else
162162+ if signals then xml_signals else
163163+ (xml_xml indent)
164164+ in
165165+ fun inf ->
166166+ with_inf (with_outf (f tree enc strip entity ns)) inf (outf inf)
167167+ in
168168+ List.iter f files
169169+170170+let encoding_of_str enc = match (String.lowercase_ascii enc) with
171171+| "" -> None
172172+| "utf-8" | "utf8" | "utf_8" -> Some `UTF_8
173173+| "utf-16" | "utf16" | "utf_16" -> Some `UTF_16
174174+| "utf-16be" | "utf16be" | "utf16_be" -> Some `UTF_16BE
175175+| "utf-16le" | "utf16le" | "utf16_le" -> Some `UTF_16LE
176176+| "iso-8859-1" | "iso88591"
177177+| "iso_8859_1" | "latin1" | "latin-1" -> Some `ISO_8859_1
178178+| "ascii" | "us-ascii" -> Some `US_ASCII
179179+| e -> pr_err (str "unknown encoding '%s', trying to guess." e); None
180180+181181+let main () =
182182+ let usage =
183183+ str "Usage: %s <options> <files>\n\
184184+ Reads xml files and outputs them on stdout.\n\
185185+ Options:" exec
186186+ in
187187+ let enc = ref "" in
188188+ let strip = ref false in
189189+ let ns = ref false in
190190+ let eref = ref false in
191191+ let xhtml = ref false in
192192+ let parse_only = ref false in
193193+ let tree = ref false in
194194+ let signals = ref false in
195195+ let outline = ref false in
196196+ let indent = ref false in
197197+ let suffix = ref "" in
198198+ let files = ref [] in
199199+ let add_file s = files := s :: !files in
200200+ let options = [
201201+ "-enc", Arg.Set_string enc,
202202+ "<enc>, use specified encoding, utf-8, utf-16, utf-16be, utf-16le,\n\
203203+ \ iso-8859-1, ascii (otherwise guesses).";
204204+ "-strip", Arg.Set strip,
205205+ "strip and collapse white space in character data.";
206206+ "-ns", Arg.Set ns,
207207+ "replace unbound namespaces prefixes by themselves (on input and output).";
208208+ "-eref", Arg.Set eref,
209209+ "replace unknown entity references by their name.";
210210+ "-xhtml", Arg.Set xhtml,
211211+ "resolve XHTML character entities.";
212212+ "-p", Arg.Set parse_only,
213213+ "parse only, no output.";
214214+ "-t", Arg.Set tree,
215215+ "build document tree in memory.";
216216+ "-signals", Arg.Set signals,
217217+ "outputs the stream of signals instead of xml (excludes -t).";
218218+ "-ot", Arg.Set outline,
219219+ "output document ascii outline instead of xml.";
220220+ "-indent", Arg.Set indent,
221221+ "indent xml output.";
222222+ "-trip", Arg.Set_string suffix,
223223+ "<suffix>, result for file <file> is output to a file <file.suffix>."; ]
224224+ in
225225+ Arg.parse options add_file usage;
226226+ let files = match (List.rev !files) with [] -> ["" (* stdin *) ] | l -> l in
227227+ let enc = encoding_of_str !enc in
228228+ let indent = if !indent then Some 2 else None in
229229+ process !signals !tree enc !strip !eref !ns !xhtml !parse_only
230230+ !outline indent !suffix files
231231+232232+let () = main ()
233233+234234+(*---------------------------------------------------------------------------
235235+ Copyright (c) 2007 The xmlm programmers
236236+237237+ Permission to use, copy, modify, and/or distribute this software for any
238238+ purpose with or without fee is hereby granted, provided that the above
239239+ copyright notice and this permission notice appear in all copies.
240240+241241+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
242242+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
243243+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
244244+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
245245+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
246246+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
247247+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
248248+ ---------------------------------------------------------------------------*)