My working unpac space for OCaml projects in development
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge opam/patches/xmlm

+3106
+9
vendor/opam/xmlm/.gitignore
··· 1 + _b0 2 + _build 3 + tmp 4 + *~ 5 + \#*# 6 + TODO 7 + *.native 8 + *.byte 9 + *.install
+2
vendor/opam/xmlm/.merlin
··· 1 + PKG b0.kit 2 + B _build/**
+1
vendor/opam/xmlm/.ocp-indent
··· 1 + strict_with=always,match_clause=4,strict_else=never
+61
vendor/opam/xmlm/B0.ml
··· 1 + open B0_kit.V000 2 + 3 + (* OCaml library names *) 4 + 5 + let xmlm = B0_ocaml.libname "xmlm" 6 + 7 + (* Libraries *) 8 + 9 + let xmlm_lib = 10 + let srcs = Fpath.[`Dir (v "src")] in 11 + let requires = [] in 12 + B0_ocaml.lib xmlm ~doc:"The xmlm library" ~srcs ~requires 13 + 14 + (* Tests *) 15 + 16 + let test_exe src ~doc = 17 + let src = Fpath.v src in 18 + let srcs = Fpath.[`File src] in 19 + let meta = B0_meta.(empty |> tag test) in 20 + let requires = [ xmlm ] in 21 + B0_ocaml.exe (Fpath.basename ~strip_ext:true src) ~srcs ~doc ~meta ~requires 22 + 23 + let test = test_exe "test/test.ml" ~doc:"Test suite" 24 + let test_tree = test_exe "test/test_tree.ml" ~doc:"Test Xmlm.output_tree" 25 + let xhtml = test_exe "test/xhtml.ml" ~doc:"XHTML entities" 26 + 27 + let xmltrip = 28 + let doc = "Reads xml files and outputs them on stdout" in 29 + let srcs = Fpath.[`File (v "test/xmltrip.ml"); 30 + `File (v "test/xhtml.ml") ] 31 + in 32 + let requires = [xmlm] in 33 + B0_ocaml.exe "xmltrip" ~public:true ~doc ~srcs ~requires 34 + 35 + (* Packs *) 36 + 37 + let default = 38 + let meta = 39 + B0_meta.empty 40 + |> B0_meta.(add authors) ["The xmlm programmers"] 41 + |> B0_meta.(add maintainers) 42 + ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"] 43 + |> B0_meta.(add homepage) "https://erratique.ch/software/xmlm" 44 + |> B0_meta.(add online_doc) "https://erratique.ch/software/xmlm/doc/" 45 + |> B0_meta.(add licenses) ["ISC"] 46 + |> B0_meta.(add repo) "git+https://erratique.ch/repos/xmlm.git" 47 + |> B0_meta.(add issues) "https://github.com/dbuenzli/xmlm/issues" 48 + |> B0_meta.(add description_tags) 49 + ["xml"; "codec"; "org:erratique"] 50 + |> B0_meta.tag B0_opam.tag 51 + |> B0_meta.add B0_opam.depends 52 + [ "ocaml", {|>= "4.05.0"|}; 53 + "ocamlfind", {|build|}; 54 + "ocamlbuild", {|build|}; 55 + "topkg", {|build & >= "1.0.3"|}; 56 + ] 57 + |> B0_meta.add B0_opam.build 58 + {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"]]|} 59 + in 60 + B0_pack.make "default" ~doc:"xmlm package" ~meta ~locked:true @@ 61 + B0_unit.list ()
vendor/opam/xmlm/BRZO

This is a binary file and will not be displayed.

+87
vendor/opam/xmlm/CHANGES.md
··· 1 + 2 + 3 + - Add support for latin-9 (ISO-8859-15) encoded XML files. 4 + Thanks to Liu Yuxi for the patch. 5 + 6 + 7 + v1.4.0 2022-02-08 La Forclaz (VS) 8 + --------------------------------- 9 + 10 + - OCaml 5.00 support. Thanks to Antonio Nuno Monteiro for the patch. 11 + 12 + v1.3.0 2017-03-15 La Forclaz (VS) 13 + --------------------------------- 14 + 15 + - Add `Xmlm.pp_{dtd,name,attribute,tag,signal}` 16 + - Safe-string support. 17 + - Build depend on topkg. 18 + - Relicense from BSD3 to ISC. 19 + 20 + v1.2.0 2013-09-06 Cambridge (UK) 21 + -------------------------------- 22 + 23 + - `Xmlm.output`, illegal XML Unicode characters in Data signals or 24 + attribute values are output as U+FFFD (thanks to David Sheets for 25 + insisting that something should be done about that). 26 + - Deprecate the ability to IO multiple documents from the same 27 + IO abstraction. 28 + - Deprecate the functorial interface. 29 + - OPAM friendly workflow and drop OASIS support. 30 + 31 + 32 + v1.1.1 2012-08-05 Lausanne 33 + -------------------------- 34 + 35 + - OASIS 0.3.0 support. 36 + 37 + 38 + v1.1.0 2012-03-16 La Forclaz (VS) 39 + --------------------------------- 40 + 41 + - OASIS support. 42 + - Fixes a bug in the UTF-16 decoder. 43 + - Fixes a bug in `Xmlm.make_output` with a custom function. Thanks to 44 + Konstantinas Myalo for the report and the patch. 45 + - New optional argument `decl` to `Xmlm.make_output` to control whether the 46 + XML declaration should be output. 47 + - New function `Xmlm.output_depth`, returns the current element nesting level. 48 + 49 + 50 + v1.0.2 2009-11-11 大足县 51 + ----------------------- 52 + 53 + - Replaced a (non tail-recursive) use of `List.map`. 54 + 55 + 56 + v1.0.1 2008-08-01 Lausanne 57 + ---------------------------- 58 + 59 + - POSIX compliant build shell script (thanks to Michael D Ekstrand). 60 + - Support for Debian packaging. 61 + 62 + 63 + v1.0.0 2008-03-17 Lausanne 64 + ---------------------------- 65 + 66 + ## New features: 67 + - Streaming IO api with support to IO arborescent data structures. 68 + - Proper XML namespace support, all names are expanded names. 69 + - Whitespace stripping respects the xml:space attributes. 70 + - Xmlm.Make functor to use other types for strings and internal buffers. 71 + - UTF-8 encoded documents can start with an UTF-8 encoded BOM. 72 + 73 + ## Incompatible changes: 74 + - `Xmlm.encoding` becomes a polymorphic variant. 75 + - `Xmlm.error` becomes a polymorphic variant and the "E_" prefix is dropped. 76 + - Removed the callback api. 77 + - Removed the tree and cursor api. 78 + 79 + ## Other: 80 + - `test/xhtml.ml` has a mapping from XHTML entities to their UTF-8 sequence. 81 + - Build system switched from make to ocamlbuild 82 + 83 + 84 + v0.9.0 2007-02-26 Lausanne 85 + -------------------------- 86 + 87 + - First release.
+13
vendor/opam/xmlm/LICENSE.md
··· 1 + Copyright (c) 2007 The xmlm programmers 2 + 3 + Permission to use, copy, modify, and/or distribute this software for any 4 + purpose with or without fee is hereby granted, provided that the above 5 + copyright notice and this permission notice appear in all copies. 6 + 7 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+50
vendor/opam/xmlm/README.md
··· 1 + Xmlm — Streaming XML codec for OCaml 2 + ------------------------------------------------------------------------------- 3 + %%VERSION%% 4 + 5 + Xmlm is a streaming codec to decode and encode the XML data format. It 6 + can process XML documents without a complete in-memory representation of the 7 + data. 8 + 9 + Xmlm is made of a single independent module and distributed 10 + under the ISC license. 11 + 12 + Home page: http://erratique.ch/software/xmlm 13 + 14 + ## Installation 15 + 16 + Xmlm can be installed with `opam`: 17 + 18 + opam install xmlm 19 + 20 + If you don't use `opam` consult the [`opam`](opam) file for build 21 + instructions. 22 + 23 + ## Documentation 24 + 25 + The documentation and API reference is automatically generated 26 + from the source interfaces. It can be consulted [online][doc] 27 + or via `odig doc xmlm`. 28 + 29 + [doc]: http://erratique.ch/software/xmlm/doc/Xmlm 30 + 31 + ## Sample programs 32 + 33 + If you installed xmlm with `opam` sample programs are located in 34 + the directory `opam config var xmlm:doc`. 35 + 36 + 37 + In the distribution sample programs and tests are located in the 38 + [`test`](test) directory of the distribution. They can be built and run 39 + with: 40 + 41 + topkg build --tests true && topkg test 42 + 43 + The `xmltrip` tool reads XML files with Xmlm and outputs them back in 44 + various ways. It is useful to understand how Xmlm handles 45 + documents. `xmltrip -help` has more information. 46 + 47 + If you need to parse XHTML, the file [`xhtml.ml`](test/xhtml.ml) in 48 + the `test` directory has an OCaml list coupling each XHTML character 49 + entity with its corresponding UTF-8 encoded character string. You can 50 + use it to program a suitable entity callback.
+4
vendor/opam/xmlm/_tags
··· 1 + true : bin_annot, safe_string 2 + <_b0> : -traverse 3 + <src> : include 4 + <test> : include
+7
vendor/opam/xmlm/doc/index.mld
··· 1 + {0 Xmlm {%html: <span class="version">%%VERSION%%</span>%}} 2 + 3 + Xmlm is a streaming codec to decode and encode the XML data format. It 4 + can process XML documents without a complete in-memory representation of the 5 + data. 6 + 7 + {!modules: Xmlm}
+2
vendor/opam/xmlm/dune-project
··· 1 + (lang dune 3.0) 2 + (name xmlm)
+27
vendor/opam/xmlm/opam
··· 1 + opam-version: "2.0" 2 + name: "xmlm" 3 + synopsis: "Streaming XML codec for OCaml" 4 + description: """\ 5 + Xmlm is a streaming codec to decode and encode the XML data format. It 6 + can process XML documents without a complete in-memory representation of the 7 + data. 8 + 9 + Xmlm is made of a single independent module and distributed 10 + under the ISC license. 11 + 12 + Home page: http://erratique.ch/software/xmlm""" 13 + maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>" 14 + authors: "The xmlm programmers" 15 + license: "ISC" 16 + tags: ["xml" "codec" "org:erratique"] 17 + homepage: "https://erratique.ch/software/xmlm" 18 + doc: "https://erratique.ch/software/xmlm/doc/" 19 + bug-reports: "https://github.com/dbuenzli/xmlm/issues" 20 + depends: [ 21 + "ocaml" {>= "4.05.0"} 22 + "ocamlfind" {build} 23 + "ocamlbuild" {build} 24 + "topkg" {build & >= "1.0.3"} 25 + ] 26 + build: ["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%"] 27 + dev-repo: "git+https://erratique.ch/repos/xmlm.git"
+7
vendor/opam/xmlm/pkg/META
··· 1 + version = "%%VERSION_NUM%%" 2 + description = "Streaming XML codec for OCaml" 3 + requires = "" 4 + archive(byte) = "xmlm.cma" 5 + archive(native) = "xmlm.cmxa" 6 + plugin(byte) = "xmlm.cma" 7 + plugin(native) = "xmlm.cmxs"
+16
vendor/opam/xmlm/pkg/pkg.ml
··· 1 + #!/usr/bin/env ocaml 2 + #use "topfind" 3 + #require "topkg" 4 + open Topkg 5 + 6 + let () = 7 + Pkg.describe "xmlm" @@ fun c -> 8 + Ok [ Pkg.mllib "src/xmlm.mllib"; 9 + Pkg.bin "test/xmltrip"; 10 + Pkg.test "test/examples"; 11 + Pkg.test "test/test"; 12 + Pkg.test "test/xhtml"; 13 + Pkg.test "test/test_tree"; 14 + Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld"; 15 + Pkg.doc "test/examples.ml"; 16 + Pkg.doc "test/xhtml.ml"; ]
+5
vendor/opam/xmlm/src/dune
··· 1 + (library 2 + (name xmlm) 3 + (public_name xmlm) 4 + (modules Xmlm) 5 + (flags (:standard -w -27-35-39)))
+1245
vendor/opam/xmlm/src/xmlm.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007 The xmlm programmers. All rights reserved. 3 + Distributed under the ISC license, see terms at the end of the file. 4 + ---------------------------------------------------------------------------*) 5 + 6 + module Std_string = String 7 + module Std_buffer = Buffer 8 + 9 + type std_string = string 10 + type std_buffer = Buffer.t 11 + 12 + module type String = sig 13 + type t 14 + val empty : t 15 + val length : t -> int 16 + val append : t -> t -> t 17 + val lowercase : t -> t 18 + val iter : (int -> unit) -> t -> unit 19 + val of_string : std_string -> t 20 + val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a 21 + val compare : t -> t -> int 22 + end 23 + 24 + module type Buffer = sig 25 + type string 26 + type t 27 + exception Full 28 + val create : int -> t 29 + val add_uchar : t -> int -> unit 30 + val clear : t -> unit 31 + val contents : t -> string 32 + val length : t -> int 33 + end 34 + 35 + module type S = sig 36 + type string 37 + type encoding = [ 38 + | `UTF_8 39 + | `UTF_16 40 + | `UTF_16BE 41 + | `UTF_16LE 42 + | `ISO_8859_1 43 + | `ISO_8859_15 44 + | `US_ASCII ] 45 + 46 + type dtd = string option 47 + type name = string * string 48 + type attribute = name * string 49 + type tag = name * attribute list 50 + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] 51 + 52 + val ns_xml : string 53 + val ns_xmlns : string 54 + 55 + type pos = int * int 56 + type error = [ 57 + | `Max_buffer_size 58 + | `Unexpected_eoi 59 + | `Malformed_char_stream 60 + | `Unknown_encoding of string 61 + | `Unknown_entity_ref of string 62 + | `Unknown_ns_prefix of string 63 + | `Illegal_char_ref of string 64 + | `Illegal_char_seq of string 65 + | `Expected_char_seqs of string list * string 66 + | `Expected_root_element ] 67 + 68 + exception Error of pos * error 69 + val error_message : error -> string 70 + 71 + type source = [ 72 + | `Channel of in_channel 73 + | `String of int * std_string 74 + | `Fun of (unit -> int) ] 75 + 76 + type input 77 + 78 + val make_input : ?enc:encoding option -> ?strip:bool -> 79 + ?ns:(string -> string option) -> 80 + ?entity: (string -> string option) -> source -> input 81 + 82 + val input : input -> signal 83 + 84 + val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> 85 + input -> 'a 86 + 87 + val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> 88 + input -> (dtd * 'a) 89 + 90 + val peek : input -> signal 91 + val eoi : input -> bool 92 + val pos : input -> pos 93 + 94 + type 'a frag = [ `El of tag * 'a list | `Data of string ] 95 + type dest = [ 96 + | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] 97 + 98 + type output 99 + val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> 100 + ?ns_prefix:(string -> string option) -> dest -> output 101 + 102 + val output_depth : output -> int 103 + val output : output -> signal -> unit 104 + val output_tree : ('a -> 'a frag) -> output -> 'a -> unit 105 + val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit 106 + end 107 + 108 + 109 + (* Unicode character lexers *) 110 + 111 + exception Malformed (* for character stream, internal only. *) 112 + 113 + let utf8_len = [| (* Char byte length according to first UTF-8 byte. *) 114 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 115 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 116 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 117 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 118 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 119 + 1; 1; 1; 1; 1; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 120 + 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 121 + 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 122 + 0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 123 + 2; 2; 2; 2; 2; 2; 2; 2; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 124 + 4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |] 125 + 126 + let uchar_utf8 i = 127 + let b0 = i () in 128 + begin match utf8_len.(b0) with 129 + | 0 -> raise Malformed 130 + | 1 -> b0 131 + | 2 -> 132 + let b1 = i () in 133 + if b1 lsr 6 != 0b10 then raise Malformed else 134 + ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) 135 + | 3 -> 136 + let b1 = i () in 137 + let b2 = i () in 138 + if b2 lsr 6 != 0b10 then raise Malformed else 139 + begin match b0 with 140 + | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then raise Malformed else () 141 + | 0xED -> if b1 < 0x80 || 0x9F < b1 then raise Malformed else () 142 + | _ -> if b1 lsr 6 != 0b10 then raise Malformed else () 143 + end; 144 + ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) 145 + | 4 -> 146 + let b1 = i () in 147 + let b2 = i () in 148 + let b3 = i () in 149 + if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then raise Malformed else 150 + begin match b0 with 151 + | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then raise Malformed else () 152 + | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then raise Malformed else () 153 + | _ -> if b1 lsr 6 != 0b10 then raise Malformed else () 154 + end; 155 + ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor 156 + ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) 157 + | _ -> assert false 158 + end 159 + 160 + let int16_be i = 161 + let b0 = i () in 162 + let b1 = i () in 163 + (b0 lsl 8) lor b1 164 + 165 + let int16_le i = 166 + let b0 = i () in 167 + let b1 = i () in 168 + (b1 lsl 8) lor b0 169 + 170 + let uchar_utf16 int16 i = 171 + let c0 = int16 i in 172 + if c0 < 0xD800 || c0 > 0xDFFF then c0 else 173 + if c0 > 0xDBFF then raise Malformed else 174 + let c1 = int16 i in 175 + (((c0 land 0x3FF) lsl 10) lor (c1 land 0x3FF)) + 0x10000 176 + 177 + let uchar_utf16be = uchar_utf16 int16_be 178 + let uchar_utf16le = uchar_utf16 int16_le 179 + let uchar_byte i = i () 180 + let uchar_iso_8859_1 i = i () 181 + 182 + let uchar_iso_8859_15 i = 183 + (* https://www.iana.org/assignments/charset-reg/ISO-8859-15 *) 184 + match i () with 185 + | 0x00A4 -> 0x20AC (* € *) 186 + | 0x00A6 -> 0x0160 (* Š *) 187 + | 0x00A8 -> 0x0161 (* š *) 188 + | 0x00B4 -> 0x017D (* Ž *) 189 + | 0x00B8 -> 0x017E (* ž *) 190 + | 0x00BC -> 0x0152 (* Œ *) 191 + | 0x00BD -> 0x0153 (* œ *) 192 + | 0x00BE -> 0x0178 (* Ÿ *) 193 + | c -> c 194 + 195 + let uchar_ascii i = let b = i () in if b > 127 then raise Malformed else b 196 + 197 + (* Functorized streaming XML IO *) 198 + 199 + module Make (String : String) (Buffer : Buffer with type string = String.t) = 200 + struct 201 + type string = String.t 202 + 203 + let str = String.of_string 204 + let str_eq s s' = (compare s s') = 0 205 + let str_empty s = (compare s String.empty) = 0 206 + let cat = String.append 207 + let str_of_char u = 208 + let b = Buffer.create 4 in 209 + Buffer.add_uchar b u; 210 + Buffer.contents b 211 + 212 + module Ht = Hashtbl.Make (struct type t = string 213 + let equal = str_eq 214 + let hash = Hashtbl.hash end) 215 + 216 + let u_nl = 0x000A (* newline *) 217 + let u_cr = 0x000D (* carriage return *) 218 + let u_space = 0x0020 (* space *) 219 + let u_quot = 0x0022 (* quote *) 220 + let u_sharp = 0x0023 (* # *) 221 + let u_amp = 0x0026 (* & *) 222 + let u_apos = 0x0027 (* ' *) 223 + let u_minus = 0x002D (* - *) 224 + let u_slash = 0x002F (* / *) 225 + let u_colon = 0x003A (* : *) 226 + let u_scolon = 0x003B (* ; *) 227 + let u_lt = 0x003C (* < *) 228 + let u_eq = 0x003D (* = *) 229 + let u_gt = 0x003E (* > *) 230 + let u_qmark = 0x003F (* ? *) 231 + let u_emark = 0x0021 (* ! *) 232 + let u_lbrack = 0x005B (* [ *) 233 + let u_rbrack = 0x005D (* ] *) 234 + let u_x = 0x0078 (* x *) 235 + let u_bom = 0xFEFF (* BOM *) 236 + let u_9 = 0x0039 (* 9 *) 237 + let u_F = 0x0046 (* F *) 238 + let u_D = 0X0044 (* D *) 239 + 240 + let s_cdata = str "CDATA[" 241 + let ns_xml = str "http://www.w3.org/XML/1998/namespace" 242 + let ns_xmlns = str "http://www.w3.org/2000/xmlns/" 243 + let n_xml = str "xml" 244 + let n_xmlns = str "xmlns" 245 + let n_space = str "space" 246 + let n_version = str "version" 247 + let n_encoding = str "encoding" 248 + let n_standalone = str "standalone" 249 + let v_yes = str "yes" 250 + let v_no = str "no" 251 + let v_preserve = str "preserve" 252 + let v_default = str "default" 253 + let v_version_1_0 = str "1.0" 254 + let v_version_1_1 = str "1.1" 255 + let v_utf_8 = str "utf-8" 256 + let v_utf_16 = str "utf-16" 257 + let v_utf_16be = str "utf-16be" 258 + let v_utf_16le = str "utf-16le" 259 + let v_iso_8859_1 = str "iso-8859-1" 260 + let v_iso_8859_15 = str "iso-8859-15" 261 + let v_us_ascii = str "us-ascii" 262 + let v_ascii = str "ascii" 263 + 264 + let name_str (p,l) = if str_empty p then l else cat p (cat (str ":") l) 265 + 266 + (* Basic types and values *) 267 + 268 + type encoding = [ 269 + | `UTF_8 270 + | `UTF_16 271 + | `UTF_16BE 272 + | `UTF_16LE 273 + | `ISO_8859_1 274 + | `ISO_8859_15 275 + | `US_ASCII ] 276 + 277 + type dtd = string option 278 + type name = string * string 279 + type attribute = name * string 280 + type tag = name * attribute list 281 + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] 282 + 283 + (* Input *) 284 + 285 + type pos = int * int 286 + type error = [ 287 + | `Max_buffer_size 288 + | `Unexpected_eoi 289 + | `Malformed_char_stream 290 + | `Unknown_encoding of string 291 + | `Unknown_entity_ref of string 292 + | `Unknown_ns_prefix of string 293 + | `Illegal_char_ref of string 294 + | `Illegal_char_seq of string 295 + | `Expected_char_seqs of string list * string 296 + | `Expected_root_element ] 297 + 298 + exception Error of pos * error 299 + 300 + let error_message e = 301 + let bracket l v r = cat (str l) (cat v (str r)) in 302 + match e with 303 + | `Expected_root_element -> str "expected root element" 304 + | `Max_buffer_size -> str "maximal buffer size exceeded" 305 + | `Unexpected_eoi -> str "unexpected end of input" 306 + | `Malformed_char_stream -> str "malformed character stream" 307 + | `Unknown_encoding e -> bracket "unknown encoding (" e ")" 308 + | `Unknown_entity_ref e -> bracket "unknown entity reference (" e ")" 309 + | `Unknown_ns_prefix e -> bracket "unknown namespace prefix (" e ")" 310 + | `Illegal_char_ref s -> bracket "illegal character reference (#" s ")" 311 + | `Illegal_char_seq s -> 312 + bracket "character sequence illegal here (\"" s "\")" 313 + | `Expected_char_seqs (exps, fnd) -> 314 + let exps = 315 + let exp acc v = cat acc (bracket "\"" v "\", ") in 316 + List.fold_left exp String.empty exps 317 + in 318 + cat (str "expected one of these character sequence: ") 319 + (cat exps (bracket "found \"" fnd "\"")) 320 + 321 + type limit = (* XML is odd to parse. *) 322 + | Stag of name (* '<' qname *) 323 + | Etag of name (* '</' qname whitespace* *) 324 + | Pi of name (* '<?' qname *) 325 + | Comment (* '<!--' *) 326 + | Cdata (* '<![CDATA[' *) 327 + | Dtd (* '<!' *) 328 + | Text (* other character *) 329 + | Eoi (* End of input *) 330 + 331 + type source = [ 332 + | `Channel of in_channel 333 + | `String of int * std_string 334 + | `Fun of (unit -> int) ] 335 + 336 + type input = 337 + { enc : encoding option; (* Expected encoding. *) 338 + strip : bool; (* Whitespace stripping default behaviour. *) 339 + fun_ns : string -> string option; (* Namespace callback. *) 340 + fun_entity : string -> string option; (* Entity reference callback. *) 341 + i : unit -> int; (* Byte level input. *) 342 + mutable uchar : (unit -> int) -> int; (* Unicode character lexer. *) 343 + mutable c : int; (* Character lookahead. *) 344 + mutable cr : bool; (* True if last u was '\r'. *) 345 + mutable line : int; (* Current line number. *) 346 + mutable col : int; (* Current column number. *) 347 + mutable limit : limit; (* Last parsed limit. *) 348 + mutable peek : signal; (* Signal lookahead. *) 349 + mutable stripping : bool; (* True if stripping whitespace. *) 350 + mutable last_white : bool; (* True if last char was white. *) 351 + mutable scopes : (name * string list * bool) list; 352 + (* Stack of qualified el. name, bound prefixes and strip behaviour. *) 353 + ns : string Ht.t; (* prefix -> uri bindings. *) 354 + ident : Buffer.t; (* Buffer for names and entity refs. *) 355 + data : Buffer.t; } (* Buffer for character and attribute data. *) 356 + 357 + let err_input_tree = "input signal not `El_start or `Data" 358 + let err_input_doc_tree = "input signal not `Dtd" 359 + let err i e = raise (Error ((i.line, i.col), e)) 360 + let err_illegal_char i u = err i (`Illegal_char_seq (str_of_char u)) 361 + let err_expected_seqs i exps s = err i (`Expected_char_seqs (exps, s)) 362 + let err_expected_chars i exps = 363 + err i (`Expected_char_seqs (List.map str_of_char exps, str_of_char i.c)) 364 + 365 + let u_eoi = max_int 366 + let u_start_doc = u_eoi - 1 367 + let u_end_doc = u_start_doc - 1 368 + let signal_start_stream = `Data String.empty 369 + 370 + let make_input ?(enc = None) ?(strip = false) ?(ns = fun _ -> None) 371 + ?(entity = fun _ -> None) src = 372 + let i = match src with 373 + | `Fun f -> f 374 + | `Channel ic -> fun () -> input_byte ic 375 + | `String (pos, s) -> 376 + let len = Std_string.length s in 377 + let pos = ref (pos - 1) in 378 + fun () -> 379 + incr pos; 380 + if !pos = len then raise End_of_file else 381 + Char.code (Std_string.get s !pos) 382 + in 383 + let bindings = 384 + let h = Ht.create 15 in 385 + Ht.add h String.empty String.empty; 386 + Ht.add h n_xml ns_xml; 387 + Ht.add h n_xmlns ns_xmlns; 388 + h 389 + in 390 + { enc = enc; strip = strip; fun_ns = ns; fun_entity = entity; 391 + i = i; uchar = uchar_byte; c = u_start_doc; cr = false; 392 + line = 1; col = 0; limit = Text; peek = signal_start_stream; 393 + stripping = strip; last_white = true; scopes = []; ns = bindings; 394 + ident = Buffer.create 64; data = Buffer.create 1024; } 395 + 396 + (* Bracketed non-terminals in comments refer to XML 1.0 non terminals *) 397 + 398 + let r : int -> int -> int -> bool = fun u a b -> a <= u && u <= b 399 + let is_white = function 0x0020 | 0x0009 | 0x000D | 0x000A -> true | _ -> false 400 + 401 + let is_char = function (* {Char} *) 402 + | u when r u 0x0020 0xD7FF -> true 403 + | 0x0009 | 0x000A | 0x000D -> true 404 + | u when r u 0xE000 0xFFFD || r u 0x10000 0x10FFFF -> true 405 + | _ -> false 406 + 407 + let is_digit u = r u 0x0030 0x0039 408 + let is_hex_digit u = 409 + r u 0x0030 0x0039 || r u 0x0041 0x0046 || r u 0x0061 0x0066 410 + 411 + let comm_range u = (* common to functions below *) 412 + r u 0x00C0 0x00D6 || r u 0x00D8 0x00F6 || r u 0x00F8 0x02FF || 413 + r u 0x0370 0x037D || r u 0x037F 0x1FFF || r u 0x200C 0x200D || 414 + r u 0x2070 0x218F || r u 0x2C00 0x2FEF || r u 0x3001 0xD7FF || 415 + r u 0xF900 0xFDCF || r u 0xFDF0 0xFFFD || r u 0x10000 0xEFFFF 416 + 417 + let is_name_start_char = function (* {NameStartChar} - ':' (XML 1.1) *) 418 + | u when r u 0x0061 0x007A || r u 0x0041 0x005A -> true (* [a-z] | [A-Z] *) 419 + | u when is_white u -> false 420 + | 0x005F -> true (* '_' *) 421 + | u when comm_range u -> true 422 + | _ -> false 423 + 424 + let is_name_char = function (* {NameChar} - ':' (XML 1.1) *) 425 + | u when r u 0x0061 0x007A || r u 0x0041 0x005A -> true (* [a-z] | [A-Z] *) 426 + | u when is_white u -> false 427 + | u when r u 0x0030 0x0039 -> true (* [0-9] *) 428 + | 0x005F | 0x002D | 0x002E | 0x00B7 -> true (* '_' '-' '.' *) 429 + | u when comm_range u || r u 0x0300 0x036F || r u 0x203F 0x2040 -> true 430 + | _ -> false 431 + 432 + let rec nextc i = 433 + if i.c = u_eoi then err i `Unexpected_eoi; 434 + if i.c = u_nl then (i.line <- i.line + 1; i.col <- 1) 435 + else i.col <- i.col + 1; 436 + i.c <- i.uchar i.i; 437 + if not (is_char i.c) then raise Malformed; 438 + if i.cr && i.c = u_nl then i.c <- i.uchar i.i; (* cr nl business *) 439 + if i.c = u_cr then (i.cr <- true; i.c <- u_nl) else i.cr <- false 440 + 441 + let nextc_eof i = try nextc i with End_of_file -> i.c <- u_eoi 442 + let skip_white i = while (is_white i.c) do nextc i done 443 + let skip_white_eof i = while (is_white i.c) do nextc_eof i done 444 + let accept i c = if i.c = c then nextc i else err_expected_chars i [ c ] 445 + 446 + let clear_ident i = Buffer.clear i.ident 447 + let clear_data i = Buffer.clear i.data 448 + let addc_ident i c = Buffer.add_uchar i.ident c 449 + let addc_data i c = Buffer.add_uchar i.data c 450 + 451 + let addc_data_strip i c = 452 + if is_white c then i.last_white <- true else 453 + begin 454 + if i.last_white && Buffer.length i.data <> 0 then addc_data i u_space; 455 + i.last_white <- false; 456 + addc_data i c 457 + end 458 + 459 + let expand_name i (prefix, local) = 460 + let external_ prefix = match i.fun_ns prefix with 461 + | None -> err i (`Unknown_ns_prefix prefix) 462 + | Some uri -> uri 463 + in 464 + try 465 + let uri = Ht.find i.ns prefix in 466 + if not (str_empty uri) then (uri, local) else 467 + if str_empty prefix then String.empty, local else 468 + (external_ prefix), local (* unbound with xmlns:prefix="" *) 469 + with Not_found -> external_ prefix, local 470 + 471 + let find_encoding i = (* Encoding mess. *) 472 + let reset uchar i = i.uchar <- uchar; i.col <- 0; nextc i in 473 + match i.enc with 474 + | None -> (* User doesn't know encoding. *) 475 + begin match nextc i; i.c with 476 + | 0xFE -> (* UTF-16BE BOM. *) 477 + nextc i; if i.c <> 0xFF then err i `Malformed_char_stream; 478 + reset uchar_utf16be i; 479 + true 480 + | 0xFF -> (* UTF-16LE BOM. *) 481 + nextc i; if i.c <> 0xFE then err i `Malformed_char_stream; 482 + reset uchar_utf16le i; 483 + true 484 + | 0xEF -> (* UTF-8 BOM. *) 485 + nextc i; if i.c <> 0xBB then err i `Malformed_char_stream; 486 + nextc i; if i.c <> 0xBF then err i `Malformed_char_stream; 487 + reset uchar_utf8 i; 488 + true 489 + | 0x3C | _ -> (* UTF-8 or other, try declaration. *) 490 + i.uchar <- uchar_utf8; 491 + false 492 + end 493 + | Some e -> (* User knows encoding. *) 494 + begin match e with 495 + | `US_ASCII -> reset uchar_ascii i 496 + | `ISO_8859_1 -> reset uchar_iso_8859_1 i 497 + | `ISO_8859_15 -> reset uchar_iso_8859_15 i 498 + | `UTF_8 -> (* Skip BOM if present. *) 499 + reset uchar_utf8 i; if i.c = u_bom then (i.col <- 0; nextc i) 500 + | `UTF_16 -> (* Which UTF-16 ? look BOM. *) 501 + let b0 = nextc i; i.c in 502 + let b1 = nextc i; i.c in 503 + begin match b0, b1 with 504 + | 0xFE, 0xFF -> reset uchar_utf16be i 505 + | 0xFF, 0xFE -> reset uchar_utf16le i 506 + | _ -> err i `Malformed_char_stream; 507 + end 508 + | `UTF_16BE -> (* Skip BOM if present. *) 509 + reset uchar_utf16be i; if i.c = u_bom then (i.col <- 0; nextc i) 510 + | `UTF_16LE -> 511 + reset uchar_utf16le i; if i.c = u_bom then (i.col <- 0; nextc i) 512 + end; 513 + true (* Ignore xml declaration. *) 514 + 515 + 516 + let p_ncname i = (* {NCName} (Namespace 1.1) *) 517 + clear_ident i; 518 + if not (is_name_start_char i.c) then err_illegal_char i i.c else 519 + begin 520 + addc_ident i i.c; nextc i; 521 + while is_name_char i.c do addc_ident i i.c; nextc i done; 522 + Buffer.contents i.ident 523 + end 524 + 525 + let p_qname i = (* {QName} (Namespace 1.1) *) 526 + let n = p_ncname i in 527 + if i.c <> u_colon then (String.empty, n) else (nextc i; (n, p_ncname i)) 528 + 529 + let p_charref i = (* {CharRef}, '&' was eaten. *) 530 + let c = ref 0 in 531 + clear_ident i; 532 + nextc i; 533 + if i.c = u_scolon then err i (`Illegal_char_ref String.empty) else 534 + begin 535 + try 536 + if i.c = u_x then 537 + begin 538 + addc_ident i i.c; 539 + nextc i; 540 + while (i.c <> u_scolon) do 541 + addc_ident i i.c; 542 + if not (is_hex_digit i.c) then raise Exit else 543 + c := !c * 16 + (if i.c <= u_9 then i.c - 48 else 544 + if i.c <= u_F then i.c - 55 else 545 + i.c - 87); 546 + nextc i; 547 + done 548 + end 549 + else 550 + while (i.c <> u_scolon) do 551 + addc_ident i i.c; 552 + if not (is_digit i.c) then raise Exit else 553 + c := !c * 10 + (i.c - 48); 554 + nextc i 555 + done 556 + with Exit -> 557 + c := -1; while i.c <> u_scolon do addc_ident i i.c; nextc i done 558 + end; 559 + nextc i; 560 + if is_char !c then (clear_ident i; addc_ident i !c; Buffer.contents i.ident) 561 + else err i (`Illegal_char_ref (Buffer.contents i.ident)) 562 + 563 + let predefined_entities = 564 + let h = Ht.create 5 in 565 + let e k v = Ht.add h (str k) (str v) in 566 + e "lt" "<"; e "gt" ">"; e "amp" "&"; e "apos" "'"; e "quot" "\""; 567 + h 568 + 569 + let p_entity_ref i = (* {EntityRef}, '&' was eaten. *) 570 + let ent = p_ncname i in 571 + accept i u_scolon; 572 + try Ht.find predefined_entities ent with Not_found -> 573 + match i.fun_entity ent with 574 + | Some s -> s 575 + | None -> err i (`Unknown_entity_ref ent) 576 + 577 + let p_reference i = (* {Reference} *) 578 + nextc i; if i.c = u_sharp then p_charref i else p_entity_ref i 579 + 580 + let p_attr_value i = (* {S}? {AttValue} *) 581 + skip_white i; 582 + let delim = 583 + if i.c = u_quot || i.c = u_apos then i.c else 584 + err_expected_chars i [ u_quot; u_apos] 585 + in 586 + nextc i; 587 + skip_white i; 588 + clear_data i; 589 + i.last_white <- true; 590 + while (i.c <> delim) do 591 + if i.c = u_lt then err_illegal_char i u_lt else 592 + if i.c = u_amp then String.iter (addc_data_strip i) (p_reference i) 593 + else (addc_data_strip i i.c; nextc i) 594 + done; 595 + nextc i; 596 + Buffer.contents i.data 597 + 598 + let p_attributes i = (* ({S} {Attribute})* {S}? *) 599 + let rec aux i pre_acc acc = 600 + if not (is_white i.c) then pre_acc, acc else 601 + begin 602 + skip_white i; 603 + if i.c = u_slash || i.c = u_gt then pre_acc, acc else 604 + begin 605 + let (prefix, local) as n = p_qname i in 606 + let v = skip_white i; accept i u_eq; p_attr_value i in 607 + let att = n, v in 608 + if str_empty prefix && str_eq local n_xmlns then 609 + begin (* xmlns *) 610 + Ht.add i.ns String.empty v; 611 + aux i (String.empty :: pre_acc) (att :: acc) 612 + end 613 + else if str_eq prefix n_xmlns then 614 + begin (* xmlns:local *) 615 + Ht.add i.ns local v; 616 + aux i (local :: pre_acc) (att :: acc) 617 + end 618 + else if str_eq prefix n_xml && str_eq local n_space then 619 + begin (* xml:space *) 620 + if str_eq v v_preserve then i.stripping <- false else 621 + if str_eq v v_default then i.stripping <- i.strip else (); 622 + aux i pre_acc (att :: acc) 623 + end 624 + else 625 + aux i pre_acc (att :: acc) 626 + end 627 + end 628 + in 629 + aux i [] [] (* Returns a list of bound prefixes and attributes *) 630 + 631 + let p_limit i = (* Parses a markup limit *) 632 + i.limit <- 633 + if i.c = u_eoi then Eoi else 634 + if i.c <> u_lt then Text else 635 + begin 636 + nextc i; 637 + if i.c = u_qmark then (nextc i; Pi (p_qname i)) else 638 + if i.c = u_slash then 639 + begin 640 + nextc i; 641 + let n = p_qname i in 642 + skip_white i; 643 + Etag n 644 + end 645 + else if i.c = u_emark then 646 + begin 647 + nextc i; 648 + if i.c = u_minus then (nextc i; accept i u_minus; Comment) else 649 + if i.c = u_D then Dtd else 650 + if i.c = u_lbrack then 651 + begin 652 + nextc i; 653 + clear_ident i; 654 + for k = 1 to 6 do (addc_ident i i.c; nextc i) done; 655 + let cdata = Buffer.contents i.ident in 656 + if str_eq cdata s_cdata then Cdata else 657 + err_expected_seqs i [ s_cdata ] cdata 658 + end 659 + else 660 + err i (`Illegal_char_seq (cat (str "<!") (str_of_char i.c))) 661 + end 662 + else 663 + Stag (p_qname i) 664 + end 665 + 666 + let rec skip_comment i = (* {Comment}, '<!--' was eaten *) 667 + while (i.c <> u_minus) do nextc i done; 668 + nextc i; 669 + if i.c <> u_minus then skip_comment i else 670 + begin 671 + nextc i; 672 + if i.c <> u_gt then err_expected_chars i [ u_gt ]; 673 + nextc_eof i 674 + end 675 + 676 + let rec skip_pi i = (* {PI}, '<?' qname was eaten *) 677 + while (i.c <> u_qmark) do nextc i done; 678 + nextc i; 679 + if i.c <> u_gt then skip_pi i else nextc_eof i 680 + 681 + let rec skip_misc i ~allow_xmlpi = match i.limit with (* {Misc}* *) 682 + | Pi (p,l) when (str_empty p && str_eq n_xml (String.lowercase l)) -> 683 + if allow_xmlpi then () else err i (`Illegal_char_seq l) 684 + | Pi _ -> skip_pi i; p_limit i; skip_misc i ~allow_xmlpi 685 + | Comment -> skip_comment i; p_limit i; skip_misc i ~allow_xmlpi 686 + | Text when is_white i.c -> 687 + skip_white_eof i; p_limit i; skip_misc i ~allow_xmlpi 688 + | _ -> () 689 + 690 + let p_chardata addc i = (* {CharData}* ({Reference}{Chardata})* *) 691 + while (i.c <> u_lt) do 692 + if i.c = u_amp then String.iter (addc i) (p_reference i) 693 + else if i.c = u_rbrack then 694 + begin 695 + addc i i.c; 696 + nextc i; 697 + if i.c = u_rbrack then begin 698 + addc i i.c; 699 + nextc i; (* detects ']'*']]>' *) 700 + while (i.c = u_rbrack) do addc i i.c; nextc i done; 701 + if i.c = u_gt then err i (`Illegal_char_seq (str "]]>")); 702 + end 703 + end 704 + else 705 + (addc i i.c; nextc i) 706 + done 707 + 708 + let rec p_cdata addc i = (* {CData} {CDEnd} *) 709 + try while (true) do 710 + if i.c = u_rbrack then begin 711 + nextc i; 712 + while i.c = u_rbrack do 713 + nextc i; 714 + if i.c = u_gt then (nextc i; raise Exit); 715 + addc i u_rbrack 716 + done; 717 + addc i u_rbrack; 718 + end; 719 + addc i i.c; 720 + nextc i; 721 + done with Exit -> () 722 + 723 + let p_xml_decl i ~ignore_enc ~ignore_utf16 = (* {XMLDecl}? *) 724 + let yes_no = [v_yes; v_no] in 725 + let p_val i = skip_white i; accept i u_eq; skip_white i; p_attr_value i in 726 + let p_val_exp i exp = 727 + let v = p_val i in 728 + if not (List.exists (str_eq v) exp) then err_expected_seqs i exp v 729 + in 730 + match i.limit with 731 + | Pi (p, l) when (str_empty p && str_eq l n_xml) -> 732 + let v = skip_white i; p_ncname i in 733 + if not (str_eq v n_version) then err_expected_seqs i [ n_version ] v; 734 + p_val_exp i [v_version_1_0; v_version_1_1]; 735 + skip_white i; 736 + if i.c <> u_qmark then begin 737 + let n = p_ncname i in 738 + if str_eq n n_encoding then begin 739 + let enc = String.lowercase (p_val i) in 740 + if not ignore_enc then begin 741 + if str_eq enc v_utf_8 then i.uchar <- uchar_utf8 else 742 + if str_eq enc v_utf_16be then i.uchar <- uchar_utf16be else 743 + if str_eq enc v_utf_16le then i.uchar <- uchar_utf16le else 744 + if str_eq enc v_iso_8859_1 then i.uchar <- uchar_iso_8859_1 else 745 + if str_eq enc v_iso_8859_15 then i.uchar <- uchar_iso_8859_15 else 746 + if str_eq enc v_us_ascii then i.uchar <- uchar_ascii else 747 + if str_eq enc v_ascii then i.uchar <- uchar_ascii else 748 + if str_eq enc v_utf_16 then 749 + if ignore_utf16 then () else (err i `Malformed_char_stream) 750 + (* A BOM should have been found. *) 751 + else 752 + err i (`Unknown_encoding enc) 753 + end; 754 + skip_white i; 755 + if i.c <> u_qmark then begin 756 + let n = p_ncname i in 757 + if str_eq n n_standalone then p_val_exp i yes_no else 758 + err_expected_seqs i [ n_standalone; str "?>" ] n 759 + end 760 + end 761 + else if str_eq n n_standalone then 762 + p_val_exp i yes_no 763 + else 764 + err_expected_seqs i [ n_encoding; n_standalone; str "?>" ] n 765 + end; 766 + skip_white i; 767 + accept i u_qmark; 768 + accept i u_gt; 769 + p_limit i 770 + | _ -> () 771 + 772 + let p_dtd_signal i =(* {Misc}* {doctypedecl} {Misc}* *) 773 + skip_misc i ~allow_xmlpi:false; 774 + if i.limit <> Dtd then `Dtd None else 775 + begin 776 + let buf = addc_data i in 777 + let nest = ref 1 in 778 + clear_data i; 779 + buf u_lt; buf u_emark; (* add eaten "<!" *) 780 + while (!nest > 0) do 781 + if i.c = u_lt then 782 + begin 783 + nextc i; 784 + if i.c <> u_emark then 785 + (buf u_lt; incr nest) 786 + else 787 + begin 788 + nextc i; 789 + if i.c <> u_minus then (* Carefull with comments ! *) 790 + (buf u_lt; buf u_emark; incr nest) 791 + else 792 + begin 793 + nextc i; 794 + if i.c <> u_minus then 795 + (buf u_lt; buf u_emark; buf u_minus; incr nest) 796 + else 797 + (nextc i; skip_comment i) 798 + end 799 + end 800 + end 801 + else if i.c = u_quot || i.c = u_apos then 802 + begin 803 + let c = i.c in 804 + buf c; nextc i; 805 + while (i.c <> c) do (buf i.c; nextc i) done; 806 + buf c; nextc i 807 + end 808 + else if i.c = u_gt then (buf u_gt; nextc i; decr nest) 809 + else (buf i.c; nextc i) 810 + done; 811 + let dtd = Buffer.contents i.data in 812 + p_limit i; 813 + skip_misc i ~allow_xmlpi:false; 814 + `Dtd (Some dtd); 815 + end 816 + 817 + let p_data i = 818 + let rec bufferize addc i = match i.limit with 819 + | Text -> p_chardata addc i; p_limit i; bufferize addc i 820 + | Cdata -> p_cdata addc i; p_limit i; bufferize addc i 821 + | (Stag _ | Etag _) -> () 822 + | Pi _ -> skip_pi i; p_limit i; bufferize addc i 823 + | Comment -> skip_comment i; p_limit i; bufferize addc i 824 + | Dtd -> err i (`Illegal_char_seq (str "<!D")) 825 + | Eoi -> err i `Unexpected_eoi 826 + in 827 + clear_data i; 828 + i.last_white <- true; 829 + bufferize (if i.stripping then addc_data_strip else addc_data) i; 830 + let d = Buffer.contents i.data in 831 + d 832 + 833 + let p_el_start_signal i n = 834 + let expand_att (((prefix, local) as n, v) as att) = 835 + if not (str_eq prefix String.empty) then expand_name i n, v else 836 + if str_eq local n_xmlns then (ns_xmlns, n_xmlns), v else 837 + att (* default namespaces do not influence attributes. *) 838 + in 839 + let strip = i.stripping in (* save it here, p_attributes may change it. *) 840 + let prefixes, atts = p_attributes i in 841 + i.scopes <- (n, prefixes, strip) :: i.scopes; 842 + `El_start ((expand_name i n), List.rev_map expand_att atts) 843 + 844 + let p_el_end_signal i n = match i.scopes with 845 + | (n', prefixes, strip) :: scopes -> 846 + if i.c <> u_gt then err_expected_chars i [ u_gt ]; 847 + if not (str_eq n n') then err_expected_seqs i [name_str n'] (name_str n); 848 + i.scopes <- scopes; 849 + i.stripping <- strip; 850 + List.iter (Ht.remove i.ns) prefixes; 851 + if scopes = [] then i.c <- u_end_doc else (nextc i; p_limit i); 852 + `El_end 853 + | _ -> assert false 854 + 855 + let p_signal i = 856 + if i.scopes = [] then 857 + match i.limit with 858 + | Stag n -> p_el_start_signal i n 859 + | _ -> err i `Expected_root_element 860 + else 861 + let rec find i = match i.limit with 862 + | Stag n -> p_el_start_signal i n 863 + | Etag n -> p_el_end_signal i n 864 + | Text | Cdata -> 865 + let d = p_data i in 866 + if str_empty d then find i else `Data d 867 + | Pi _ -> skip_pi i; p_limit i; find i 868 + | Comment -> skip_comment i; p_limit i; find i 869 + | Dtd -> err i (`Illegal_char_seq (str "<!D")) 870 + | Eoi -> err i `Unexpected_eoi 871 + in 872 + begin match i.peek with 873 + | `El_start (n, _) -> (* finish to input start el. *) 874 + skip_white i; 875 + if i.c = u_gt then (accept i u_gt; p_limit i) else 876 + if i.c = u_slash then 877 + begin 878 + let tag = match i.scopes with 879 + | (tag, _, _) :: _ -> tag | _ -> assert false 880 + in 881 + (nextc i; i.limit <- Etag tag) 882 + end 883 + else 884 + err_expected_chars i [ u_slash; u_gt ] 885 + | _ -> () 886 + end; 887 + find i 888 + 889 + let eoi i = 890 + try 891 + if i.c = u_eoi then true else 892 + if i.c <> u_start_doc then false else (* In a document. *) 893 + if i.peek <> `El_end then (* Start of document sequence. *) 894 + begin 895 + let ignore_enc = find_encoding i in 896 + p_limit i; 897 + p_xml_decl i ~ignore_enc ~ignore_utf16:false; 898 + i.peek <- p_dtd_signal i; 899 + false 900 + end 901 + else (* Subsequent documents. *) 902 + begin 903 + nextc_eof i; 904 + p_limit i; 905 + if i.c = u_eoi then true else 906 + begin 907 + skip_misc i ~allow_xmlpi:true; 908 + if i.c = u_eoi then true else 909 + begin 910 + p_xml_decl i ~ignore_enc:false ~ignore_utf16:true; 911 + i.peek <- p_dtd_signal i; 912 + false 913 + end 914 + end 915 + end 916 + with 917 + | Buffer.Full -> err i `Max_buffer_size 918 + | Malformed -> err i `Malformed_char_stream 919 + | End_of_file -> err i `Unexpected_eoi 920 + 921 + let peek i = if eoi i then err i `Unexpected_eoi else i.peek 922 + 923 + let input i = 924 + try 925 + if i.c = u_end_doc then (i.c <- u_start_doc; i.peek) else 926 + let s = peek i in 927 + i.peek <- p_signal i; 928 + s 929 + with 930 + | Buffer.Full -> err i `Max_buffer_size 931 + | Malformed -> err i `Malformed_char_stream 932 + | End_of_file -> err i `Unexpected_eoi 933 + 934 + let input_tree ~el ~data i = match input i with 935 + | `Data d -> data d 936 + | `El_start tag -> 937 + let rec aux i tags context = match input i with 938 + | `El_start tag -> aux i (tag :: tags) ([] :: context) 939 + | `El_end -> 940 + begin match tags, context with 941 + | tag :: tags', childs :: context' -> 942 + let el = el tag (List.rev childs) in 943 + begin match context' with 944 + | parent :: context'' -> aux i tags' ((el :: parent) :: context'') 945 + | [] -> el 946 + end 947 + | _ -> assert false 948 + end 949 + | `Data d -> 950 + begin match context with 951 + | childs :: context' -> aux i tags (((data d) :: childs) :: context') 952 + | [] -> assert false 953 + end 954 + | `Dtd _ -> assert false 955 + in 956 + aux i (tag :: []) ([] :: []) 957 + | _ -> invalid_arg err_input_tree 958 + 959 + 960 + let input_doc_tree ~el ~data i = match input i with 961 + | `Dtd d -> d, input_tree ~el ~data i 962 + | _ -> invalid_arg err_input_doc_tree 963 + 964 + let pos i = i.line, i.col 965 + 966 + (* Output *) 967 + 968 + type 'a frag = [ `El of tag * 'a list | `Data of string ] 969 + type dest = [ 970 + | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] 971 + 972 + type output = 973 + { decl : bool; (* True if the XML declaration should be output. *) 974 + nl : bool; (* True if a newline is output at the end. *) 975 + indent : int option; (* Optional indentation. *) 976 + fun_prefix : string -> string option; (* Prefix callback. *) 977 + prefixes : string Ht.t; (* uri -> prefix bindings. *) 978 + outs : std_string -> int -> int -> unit; (* String output. *) 979 + outc : char -> unit; (* character output. *) 980 + mutable last_el_start : bool; (* True if last signal was `El_start *) 981 + mutable scopes : (name * (string list)) list; 982 + (* Qualified el. name and bound uris. *) 983 + mutable depth : int; } (* Scope depth. *) 984 + 985 + let err_prefix uri = "unbound namespace (" ^ uri ^ ")" 986 + let err_dtd = "dtd signal not allowed here" 987 + let err_el_start = "start signal not allowed here" 988 + let err_el_end = "end signal without matching start signal" 989 + let err_data = "data signal not allowed here" 990 + 991 + let make_output ?(decl = true) ?(nl = false) ?(indent = None) 992 + ?(ns_prefix = fun _ ->None) d = 993 + let outs, outc = match d with 994 + | `Channel c -> (output_substring c), (output_char c) 995 + | `Buffer b -> (Std_buffer.add_substring b), (Std_buffer.add_char b) 996 + | `Fun f -> 997 + let os s p l = 998 + for i = p to p + l - 1 do f (Char.code (Std_string.get s i)) done 999 + in 1000 + let oc c = f (Char.code c) in 1001 + os, oc 1002 + in 1003 + let prefixes = 1004 + let h = Ht.create 10 in 1005 + Ht.add h String.empty String.empty; 1006 + Ht.add h ns_xml n_xml; 1007 + Ht.add h ns_xmlns n_xmlns; 1008 + h 1009 + in 1010 + { decl = decl; outs = outs; outc = outc; nl = nl; indent = indent; 1011 + last_el_start = false; prefixes = prefixes; scopes = []; depth = -1; 1012 + fun_prefix = ns_prefix; } 1013 + 1014 + let output_depth o = o.depth 1015 + let outs o s = o.outs s 0 (Std_string.length s) 1016 + let str_utf_8 s = String.to_utf_8 (fun _ s -> s) "" s 1017 + let out_utf_8 o s = ignore (String.to_utf_8 (fun o s -> outs o s; o) o s) 1018 + 1019 + let prefix_name o (ns, local) = 1020 + try 1021 + if str_eq ns ns_xmlns && str_eq local n_xmlns then (String.empty, n_xmlns) 1022 + else (Ht.find o.prefixes ns, local) 1023 + with Not_found -> 1024 + match o.fun_prefix ns with 1025 + | None -> invalid_arg (err_prefix (str_utf_8 ns)) 1026 + | Some prefix -> prefix, local 1027 + 1028 + let bind_prefixes o atts = 1029 + let add acc ((ns, local), uri) = 1030 + if not (str_eq ns ns_xmlns) then acc else 1031 + begin 1032 + let prefix = if str_eq local n_xmlns then String.empty else local in 1033 + Ht.add o.prefixes uri prefix; 1034 + uri :: acc 1035 + end 1036 + in 1037 + List.fold_left add [] atts 1038 + 1039 + let out_data o s = 1040 + let out () s = 1041 + let len = Std_string.length s in 1042 + let start = ref 0 in 1043 + let last = ref 0 in 1044 + let escape e = 1045 + o.outs s !start (!last - !start); 1046 + outs o e; 1047 + incr last; 1048 + start := !last 1049 + in 1050 + while (!last < len) do match Std_string.get s !last with 1051 + | '<' -> escape "&lt;" (* Escape markup delimiters. *) 1052 + | '>' -> escape "&gt;" 1053 + | '&' -> escape "&amp;" 1054 + (* | '\'' -> escape "&apos;" *) (* Not needed we use \x22 for attributes. *) 1055 + | '\x22' -> escape "&quot;" 1056 + | '\n' | '\t' | '\r' -> incr last 1057 + | c when c < ' ' -> escape "\xEF\xBF\xBD" (* illegal, subst. by U+FFFD *) 1058 + | _ -> incr last 1059 + done; 1060 + o.outs s !start (!last - !start) 1061 + in 1062 + String.to_utf_8 out () s 1063 + 1064 + let out_qname o (p, l) = 1065 + if not (str_empty p) then (out_utf_8 o p; o.outc ':'); 1066 + out_utf_8 o l 1067 + 1068 + let out_attribute o (n, v) = 1069 + o.outc ' '; out_qname o (prefix_name o n); outs o "=\x22"; 1070 + out_data o v; 1071 + o.outc '\x22' 1072 + 1073 + let output o s = 1074 + let indent o = match o.indent with 1075 + | None -> () 1076 + | Some c -> for i = 1 to (o.depth * c) do o.outc ' ' done 1077 + in 1078 + let unindent o = match o.indent with None -> () | Some _ -> o.outc '\n' in 1079 + if o.depth = -1 then 1080 + begin match s with 1081 + | `Dtd d -> 1082 + if o.decl then outs o "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; 1083 + begin match d with 1084 + | Some dtd -> out_utf_8 o dtd; o.outc '\n' 1085 + | None -> () 1086 + end; 1087 + o.depth <- 0 1088 + | `Data _ -> invalid_arg err_data 1089 + | `El_start _ -> invalid_arg err_el_start 1090 + | `El_end -> invalid_arg err_el_end 1091 + end 1092 + else 1093 + begin match s with 1094 + | `El_start (n, atts) -> 1095 + if o.last_el_start then (outs o ">"; unindent o); 1096 + indent o; 1097 + let uris = bind_prefixes o atts in 1098 + let qn = prefix_name o n in 1099 + o.outc '<'; out_qname o qn; List.iter (out_attribute o) atts; 1100 + o.scopes <- (qn, uris) :: o.scopes; 1101 + o.depth <- o.depth + 1; 1102 + o.last_el_start <- true 1103 + | `El_end -> 1104 + begin match o.scopes with 1105 + | (n, uris) :: scopes' -> 1106 + o.depth <- o.depth - 1; 1107 + if o.last_el_start then outs o "/>" else 1108 + begin 1109 + indent o; 1110 + outs o "</"; out_qname o n; o.outc '>'; 1111 + end; 1112 + o.scopes <- scopes'; 1113 + List.iter (Ht.remove o.prefixes) uris; 1114 + o.last_el_start <- false; 1115 + if o.depth = 0 then (if o.nl then o.outc '\n'; o.depth <- -1;) 1116 + else unindent o 1117 + | [] -> invalid_arg err_el_end 1118 + end 1119 + | `Data d -> 1120 + if o.last_el_start then (outs o ">"; unindent o); 1121 + indent o; 1122 + out_data o d; 1123 + unindent o; 1124 + o.last_el_start <- false 1125 + | `Dtd _ -> failwith err_dtd 1126 + end 1127 + 1128 + let output_tree frag o v = 1129 + let rec aux o = function 1130 + | (v :: rest) :: context -> 1131 + begin match frag v with 1132 + | `El (tag, childs) -> 1133 + output o (`El_start tag); 1134 + aux o (childs :: rest :: context) 1135 + | (`Data d) as signal -> 1136 + output o signal; 1137 + aux o (rest :: context) 1138 + end 1139 + | [] :: [] -> () 1140 + | [] :: context -> output o `El_end; aux o context 1141 + | [] -> assert false 1142 + in 1143 + aux o ([v] :: []) 1144 + 1145 + let output_doc_tree frag o (dtd, v) = 1146 + output o (`Dtd dtd); 1147 + output_tree frag o v 1148 + 1149 + end 1150 + 1151 + (* Default streaming XML IO *) 1152 + 1153 + module String = struct 1154 + type t = string 1155 + let empty = "" 1156 + let length = String.length 1157 + let append = ( ^ ) 1158 + let lowercase = String.lowercase_ascii 1159 + let iter f s = 1160 + let len = Std_string.length s in 1161 + let pos = ref ~-1 in 1162 + let i () = 1163 + incr pos; 1164 + if !pos = len then raise Exit else 1165 + Char.code (Std_string.get s !pos) 1166 + in 1167 + try while true do f (uchar_utf8 i) done with Exit -> () 1168 + 1169 + let of_string s = s 1170 + let to_utf_8 f v x = f v x 1171 + let compare = String.compare 1172 + end 1173 + 1174 + module Buffer = struct 1175 + type string = String.t 1176 + type t = Buffer.t 1177 + exception Full 1178 + let create = Buffer.create 1179 + let add_uchar b u = 1180 + try 1181 + (* UTF-8 encodes an uchar in the buffer, assumes u is valid code point. *) 1182 + let buf c = Buffer.add_char b (Char.chr c) in 1183 + if u <= 0x007F then 1184 + (buf u) 1185 + else if u <= 0x07FF then 1186 + (buf (0xC0 lor (u lsr 6)); 1187 + buf (0x80 lor (u land 0x3F))) 1188 + else if u <= 0xFFFF then 1189 + (buf (0xE0 lor (u lsr 12)); 1190 + buf (0x80 lor ((u lsr 6) land 0x3F)); 1191 + buf (0x80 lor (u land 0x3F))) 1192 + else 1193 + (buf (0xF0 lor (u lsr 18)); 1194 + buf (0x80 lor ((u lsr 12) land 0x3F)); 1195 + buf (0x80 lor ((u lsr 6) land 0x3F)); 1196 + buf (0x80 lor (u land 0x3F))) 1197 + with Failure _ -> raise Full 1198 + 1199 + let clear b = Buffer.clear b 1200 + let contents = Buffer.contents 1201 + let length = Buffer.length 1202 + end 1203 + 1204 + include Make(String) (Buffer) 1205 + 1206 + (* Pretty printers *) 1207 + 1208 + let pp = Format.fprintf 1209 + let rec pp_list ?(pp_sep = Format.pp_print_cut) pp_v ppf = function 1210 + | [] -> () 1211 + | v :: vs -> 1212 + pp_v ppf v; if vs <> [] then (pp_sep ppf (); pp_list ~pp_sep pp_v ppf vs) 1213 + 1214 + let pp_name ppf (p, l) = if p <> "" then pp ppf "%s:%s" p l else pp ppf "%s" l 1215 + let pp_attribute ppf (n, v) = pp ppf "@[<1>(%a,@,%S)@]" pp_name n v 1216 + let pp_tag ppf (name, atts) = 1217 + let pp_sep ppf () = pp ppf ";@ " in 1218 + pp ppf "@[<1>(%a,@,@[<1>[%a]@])@]" 1219 + pp_name name (pp_list ~pp_sep pp_attribute) atts 1220 + 1221 + let pp_dtd ppf = function 1222 + | None -> pp ppf "None" 1223 + | Some dtd -> pp ppf "@[<1>(Some@ %S)@]" dtd 1224 + 1225 + let pp_signal ppf = function 1226 + | `Data s -> pp ppf "@[`Data %S@]" s 1227 + | `El_end -> pp ppf "`El_end" 1228 + | `El_start tag -> pp ppf "@[`El_start %a@]" pp_tag tag 1229 + | `Dtd dtd -> pp ppf "@[`Dtd %a@]" pp_dtd dtd 1230 + 1231 + (*---------------------------------------------------------------------------- 1232 + Copyright (c) 2007 The xmlm programmers 1233 + 1234 + Permission to use, copy, modify, and/or distribute this software for any 1235 + purpose with or without fee is hereby granted, provided that the above 1236 + copyright notice and this permission notice appear in all copies. 1237 + 1238 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 1239 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 1240 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 1241 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1242 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 1243 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 1244 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 1245 + ---------------------------------------------------------------------------*)
+827
vendor/opam/xmlm/src/xmlm.mli
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007 The xmlm programmers. All rights reserved. 3 + Distributed under the ISC license, see terms at the end of the file. 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Streaming XML codec. 7 + 8 + A well-formed sequence of {{!signal}signals} represents an 9 + {{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in 10 + depth first order (this has nothing to do with XML 11 + well-formedness). Input pulls a well-formed sequence of signals 12 + from a data source and output pushes a well-formed sequence of 13 + signals to a data destination. Functions are provided to easily 14 + transform sequences of signals to/from arborescent data structures. 15 + 16 + Consult the {{!io}features and limitations} and {{!ex}examples} 17 + of use. 18 + 19 + {b References} 20 + {ul 21 + {- Tim Bray. 22 + {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}}, 23 + 1998.} 24 + {- Tim Bray et al. 25 + {e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}}, 26 + 2006.}} *) 27 + 28 + (** {1:types Basic types and values} *) 29 + 30 + (** The type for character encodings. For [`UTF_16], endianness is 31 + determined from the 32 + {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) 33 + type encoding = [ 34 + | `UTF_8 35 + | `UTF_16 36 + (** Endianness determined from the 37 + {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) 38 + | `UTF_16BE 39 + | `UTF_16LE 40 + | `ISO_8859_1 41 + | `ISO_8859_15 42 + | `US_ASCII ] 43 + 44 + type dtd = string option 45 + (** The type for the optional 46 + {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *) 47 + 48 + type name = string * string 49 + (** The type for attribute and element's 50 + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names} 51 + [(uri,local)]. An empty [uri] represents a name without a 52 + namespace name, i.e. an unprefixed name 53 + that is not under the scope of a default namespace. *) 54 + 55 + type attribute = name * string 56 + (** The type for attributes. Name and attribute data. *) 57 + 58 + type tag = name * attribute list 59 + (** The type for an element tag. Tag name and attribute list. *) 60 + 61 + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] 62 + (** The type for signals. A {e well-formed} sequence of signals belongs 63 + to the language of the [doc] grammar : 64 + {[doc ::= `Dtd tree 65 + tree ::= `El_start child `El_end 66 + child ::= `Data trees | trees 67 + trees ::= tree child | epsilon]} 68 + The [trees] production is used to expresses the fact that there will 69 + never be two consecutive `Data signals in the children of an element. 70 + 71 + Input and output deal only with well-formed sequences or 72 + exceptions are raised. However on output consecutive [`Data] 73 + signals are allowed. *) 74 + 75 + val ns_xml : string 76 + (** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the 77 + reserved ["xml"] prefix. *) 78 + 79 + val ns_xmlns : string 80 + (** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the 81 + reserved ["xmlns"] prefix. *) 82 + 83 + val pp_dtd : Format.formatter -> dtd -> unit 84 + (** [pp_dtd ppf dtd] prints an unspecified representation of [dtd] on [ppf]. *) 85 + 86 + val pp_name : Format.formatter -> name -> unit 87 + (** [pp_name ppf name] prints an unspecified representation of [name] on 88 + [ppf]. *) 89 + 90 + val pp_attribute : Format.formatter -> attribute -> unit 91 + (** [pp_attribute ppf att] prints an unspecified representation of [att] on 92 + [ppf]. *) 93 + 94 + val pp_tag : Format.formatter -> tag -> unit 95 + (** [pp_tag ppf tag] prints an unspecified representation of [tag] on 96 + [ppf]. *) 97 + 98 + val pp_signal : Format.formatter -> signal -> unit 99 + (** [pp_signal ppf s] prints an unspecified representation of [s] on 100 + [ppf]. *) 101 + 102 + (** {1:api_input Input} *) 103 + 104 + type pos = int * int 105 + (** The type for input positions. Line and column number, both start 106 + with 1. *) 107 + 108 + (** The type for input errors. *) 109 + type error = [ 110 + | `Max_buffer_size 111 + (** Maximal buffer size exceeded ([Sys.max_string_length]). *) 112 + | `Unexpected_eoi 113 + (** Unexpected end of input. *) 114 + | `Malformed_char_stream 115 + (** Malformed underlying character stream. *) 116 + | `Unknown_encoding of string 117 + (** Unknown encoding. *) 118 + | `Unknown_entity_ref of string 119 + (** Unknown entity reference, {{!inentity} details}. *) 120 + | `Unknown_ns_prefix of string 121 + (** Unknown namespace prefix {{!inns} details} *) 122 + | `Illegal_char_ref of string 123 + (** Illegal character reference. *) 124 + | `Illegal_char_seq of string 125 + (** Illegal character sequence. *) 126 + | `Expected_char_seqs of string list * string 127 + (** Expected one of the character sequences in the list but found another. *) 128 + | `Expected_root_element 129 + (** Expected the document's root element. *) ] 130 + 131 + val error_message : error -> string 132 + (** Converts the error to an english error message. *) 133 + 134 + exception Error of pos * error 135 + (** Raised on input errors. *) 136 + 137 + type source = [ 138 + | `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ] 139 + (** The type for input sources. For [`String] starts reading at the 140 + given integer position. For [`Fun] the function must return the 141 + next {e byte} as an [int] and raise [End_of_file] if there is no 142 + such byte. *) 143 + 144 + type input 145 + (** The type for input abstractions. *) 146 + 147 + val make_input : ?enc:encoding option -> ?strip:bool -> 148 + ?ns:(string -> string option) -> 149 + ?entity: (string -> string option) -> source -> input 150 + (** Returns a new input abstraction reading from the given source. 151 + {ul 152 + {- [enc], character encoding of the document, {{!inenc} details}. 153 + Defaults to [None].} 154 + {- [strip], strips whitespace in character data, {{!inwspace} details}. 155 + Defaults to [false].} 156 + {- [ns] is called to bind undeclared namespace prefixes, 157 + {{!inns} details}. Default returns always [None].} 158 + {- [entity] is called to resolve non predefined entity references, 159 + {{!inentity} details}. Default returns always [None].}} *) 160 + 161 + val input : input -> signal 162 + (** Inputs a signal. Repeated invocation of the function with the same 163 + input abstraction will generate a {{!signal}well-formed} sequence 164 + of signals or an {!Error} is raised. Furthermore there will be no 165 + two consecutive [`Data] signals in the sequence and their string 166 + is always non empty. 167 + 168 + This behaviour is {b deprecated}: after a well-formed sequence was 169 + input another may be input, see {!eoi} and {{!iseq}details}. 170 + 171 + {b Raises} {!Error} on input errors. *) 172 + 173 + val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> 174 + input -> 'a 175 + (** If the next signal is a : 176 + {ul 177 + {- [`Data] signal, inputs it and invokes [data] with the character data.} 178 + {- [`El_start] signal, inputs the sequence of signals until its 179 + matching [`El_end] and invokes [el] and [data] as follows 180 + {ul 181 + {- [el], is called on each [`El_end] signals with the corresponding 182 + [`El_start] tag and the result of the callback invocation for the 183 + element's children.} 184 + {- [data], is called on each [`Data] signals with the character data. 185 + This function won't be called twice consecutively or with the empty 186 + string.}}} 187 + {- Other signals, raises [Invalid_argument].}} 188 + 189 + {b Raises} {!Error} on input errors and [Invalid_argument] 190 + if the next signal is not [`El_start] or [`Data]. *) 191 + 192 + val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> 193 + input -> (dtd * 'a) 194 + (** Same as {!input_tree} but reads a complete {{!signal}well-formed} 195 + sequence of signals. 196 + 197 + {b Raises} {!Error} on input errors and [Invalid_argument] 198 + if the next signal is not [`Dtd]. *) 199 + 200 + val peek : input -> signal 201 + (** Same as {!val-input} but doesn't remove the signal from the sequence. 202 + 203 + {b Raises} {!Error} on input errors. *) 204 + 205 + val eoi : input -> bool 206 + (** Returns [true] if the end of input is reached. See {{!iseq}details}. 207 + 208 + {b Raises} {!Error} on input errors. *) 209 + 210 + val pos : input -> pos 211 + (** Current position in the input abstraction. *) 212 + 213 + (** {1:api_output Output} *) 214 + 215 + type 'a frag = [ `El of tag * 'a list | `Data of string ] 216 + (** The type for deconstructing data structures of type ['a]. *) 217 + 218 + type dest = [ `Channel of out_channel | `Buffer of Buffer.t 219 + | `Fun of (int -> unit) ] 220 + (** The type for output destinations. For [`Buffer], the buffer won't 221 + be cleared. For [`Fun] the function is called with the output {e 222 + bytes} as [int]s. *) 223 + 224 + type output 225 + (** The type for output abstractions. *) 226 + 227 + val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> 228 + ?ns_prefix:(string -> string option) -> dest -> output 229 + (** Returns a new output abstraction writing to the given destination. 230 + {ul 231 + {- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML 232 + declaration} is output (defaults to [true]).} 233 + {- [nl], if [true] a newline is output when the root's element [`El_end] 234 + signal is output. 235 + Defaults to [false].} 236 + {- [indent], identation behaviour, see {{!outindent} details}. Defaults to 237 + [None].} 238 + {- [ns_prefix], undeclared namespace prefix bindings, 239 + see {{!outns}details}. Default returns always [None].}} *) 240 + 241 + val output : output -> signal -> unit 242 + (** Outputs a signal. 243 + 244 + This behaviour is {b deprecated}: after a well-formed sequence of 245 + signals was output a new well-formed sequence can be output. 246 + 247 + {b Raises} [Invalid_argument] if the resulting signal sequence on 248 + the output abstraction is not {{!signal}well-formed} or if a 249 + namespace name could not be bound to a prefix. *) 250 + 251 + val output_depth : output -> int 252 + (** [output_depth o] is [o]'s current element nesting level (undefined 253 + before the first [`El_start] and after the last [`El_end]). *) 254 + 255 + val output_tree : ('a -> 'a frag) -> output -> 'a -> unit 256 + (** Outputs signals corresponding to a value by recursively 257 + applying the given value deconstructor. 258 + 259 + {b Raises} see {!val-output}. *) 260 + 261 + val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit 262 + (** Same as {!output_tree} but outputs a complete {{!signal}well-formed} 263 + sequence of signals. 264 + 265 + {b Raises} see {!val-output}. *) 266 + 267 + (** {1:sto Functorial interface (deprecated)} 268 + 269 + {b WARNING.} The functioral interface is deprecated and will be 270 + removed. 271 + 272 + {!Make} allows client to specify types for strings and internal 273 + buffers. Among other things this can be used to perform 274 + hash-consing or to process the character stream, e.g. to normalize 275 + unicode characters or to convert to a custom encoding. *) 276 + 277 + type std_string = string 278 + type std_buffer = Buffer.t 279 + 280 + (** Input signature for strings. *) 281 + module type String = sig 282 + 283 + type t 284 + (** The type for strings. *) 285 + 286 + val empty : t 287 + (** The empty string. *) 288 + 289 + val length : t -> int 290 + (** Returns the length of the string. *) 291 + 292 + val append : t -> t -> t 293 + (** Concatenates two strings. *) 294 + 295 + val lowercase : t -> t 296 + (** New string with uppercase letter translated 297 + to lowercase (correctness is only needed for ASCII 298 + {{:http://www.unicode.org/glossary/#code_point}code point}). *) 299 + 300 + val iter : (int -> unit) -> t -> unit 301 + (** Iterates over the unicode 302 + {{:http://www.unicode.org/glossary/#code_point}code point} 303 + of the given string. *) 304 + 305 + val of_string : std_string -> t 306 + (** String from an OCaml string. *) 307 + 308 + val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a 309 + (** [to_utf_8 f v s], is [f (... (f (f v s1) s2) ...) sn]. Where the 310 + concatenation of [s1], [s2], ... [sn] is [s] as an UTF-8 stream. *) 311 + 312 + val compare : t -> t -> int 313 + (** String comparison. Binary comparison is sufficent. *) 314 + end 315 + 316 + (** Input signature for internal buffers. *) 317 + module type Buffer = sig 318 + 319 + type string 320 + (** The type for strings. *) 321 + 322 + type t 323 + (** The type for buffers. *) 324 + 325 + exception Full 326 + (** Raised if the buffer cannot be grown. *) 327 + 328 + val create : int -> t 329 + (** Creates a buffer of the given size. *) 330 + 331 + val add_uchar : t -> int -> unit 332 + (** Adds the given (guaranteed valid) unicode 333 + {{:http://www.unicode.org/glossary/#code_point}code point} to a 334 + buffer. 335 + 336 + {b Raises} {!Full} if the buffer cannot be grown. *) 337 + 338 + val clear : t -> unit 339 + (** Clears the buffer. *) 340 + 341 + val contents : t -> string 342 + (** Returns the buffer contents. *) 343 + 344 + val length : t -> int 345 + (** Returns the number of characters contained in the buffer. *) 346 + end 347 + 348 + (** Output signature of {!Make}. *) 349 + module type S = sig 350 + 351 + (** {1 Basic types and values} *) 352 + 353 + type string 354 + 355 + type encoding = [ 356 + | `UTF_8 357 + | `UTF_16 358 + | `UTF_16BE 359 + | `UTF_16LE 360 + | `ISO_8859_1 361 + | `ISO_8859_15 362 + | `US_ASCII ] 363 + 364 + type dtd = string option 365 + type name = string * string 366 + type attribute = name * string 367 + type tag = name * attribute list 368 + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] 369 + 370 + val ns_xml : string 371 + val ns_xmlns : string 372 + 373 + (** {1 Input} *) 374 + 375 + type pos = int * int 376 + type error = [ 377 + | `Max_buffer_size 378 + | `Unexpected_eoi 379 + | `Malformed_char_stream 380 + | `Unknown_encoding of string 381 + | `Unknown_entity_ref of string 382 + | `Unknown_ns_prefix of string 383 + | `Illegal_char_ref of string 384 + | `Illegal_char_seq of string 385 + | `Expected_char_seqs of string list * string 386 + | `Expected_root_element ] 387 + 388 + exception Error of pos * error 389 + val error_message : error -> string 390 + 391 + type source = [ 392 + | `Channel of in_channel 393 + | `String of int * std_string 394 + | `Fun of (unit -> int) ] 395 + 396 + type input 397 + 398 + val make_input : ?enc:encoding option -> ?strip:bool -> 399 + ?ns:(string -> string option) -> 400 + ?entity: (string -> string option) -> source -> input 401 + 402 + val input : input -> signal 403 + 404 + val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> 405 + input -> 'a 406 + 407 + val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> 408 + input -> (dtd * 'a) 409 + 410 + val peek : input -> signal 411 + val eoi : input -> bool 412 + val pos : input -> pos 413 + 414 + (** {1 Output} *) 415 + 416 + type 'a frag = [ `El of tag * 'a list | `Data of string ] 417 + type dest = [ 418 + | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] 419 + 420 + type output 421 + val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> 422 + ?ns_prefix:(string -> string option) -> dest -> output 423 + 424 + val output_depth : output -> int 425 + val output : output -> signal -> unit 426 + val output_tree : ('a -> 'a frag) -> output -> 'a -> unit 427 + val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit 428 + end 429 + 430 + (** Functor building streaming XML IO with the given strings and buffers. *) 431 + module Make (String : String) (Buffer : Buffer with type string = String.t) : S 432 + with type string = String.t 433 + 434 + (** {1:io Features and limitations} 435 + 436 + The module assumes strings are immutable, thus strings 437 + the client gives or receives {e during} the input and output process 438 + must not be modified. 439 + {2:input Input} 440 + {3:inenc Encoding} 441 + 442 + The parser supports ASCII, US-ASCII, 443 + {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}, 444 + {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16}, 445 + {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE}, 446 + {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and 447 + {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1} 448 + (Latin-1) encoded documents. But strings returned by 449 + the library are {b always} UTF-8 encoded. 450 + 451 + The encoding can be specified explicitly using the optional 452 + argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a 453 + {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the 454 + beginning of the document. If there is no BOM it uses the encoding 455 + specified in the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML 456 + declaration}. Finally, if there is no XML declaration UTF-8 is assumed. 457 + {3:inwspace White space handling} 458 + 459 + The parser performs 460 + {{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data 461 + normalization} on {e every} attribute data. This means that 462 + attribute data does not have leading and trailling white space and that 463 + any white space is collapsed and transformed to a single space 464 + character ([U+0020]). 465 + 466 + White space handling of character data depends on the [strip] 467 + argument. If [strip] is [true], character data is treated like 468 + attribute data, white space before and after elements is removed 469 + and any white space is collapsed and transformed to a single 470 + space character ([U+0020]), except if the data is under the scope of a {e 471 + xml:space} attribute whose value is {e preserve}. If [strip] is 472 + [false] all white space data is preserved as present in the 473 + document (however all kinds of 474 + {{:http://www.w3.org/TR/REC-xml/#sec-line-ends}line ends} are 475 + translated to the newline character ([U+000A]). 476 + 477 + {3:inns Namespaces} 478 + 479 + Xmlm's {{!name}names} are 480 + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}. 481 + The parser automatically handles the document's namespace 482 + declarations. Undeclared namespace prefixes can be bound via the 483 + callback [ns], which must return a namespace name. If [ns] returns 484 + [None] an [`Unknown_ns_prefix] error is raised. 485 + 486 + Attributes used for namespace declarations are preserved by the 487 + parser. They are in the {!ns_xmlns} namespace. Default namespace 488 + declarations made with {i xmlns} have the attribute name 489 + [(Xmlm.ns_xmlns, "xmlns")]. Prefix declarations have the prefix as 490 + the local name, for example {i xmlns:ex} results in the attribute name 491 + [(Xmlm.ns_xmlns, "ex")]. 492 + 493 + Regarding constraints on the usage of the {i xml} and {i xmlns} 494 + prefixes by documents, the parser does not report errors on violations 495 + of the {i must} constraints listed in 496 + {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}. 497 + 498 + {3:inentity Character and entity references} 499 + 500 + {{:http://www.w3.org/TR/REC-xml/#dt-charref}Character references} 501 + and {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined 502 + entities} are automatically resolved. Other entity references can 503 + be resolved by the callback [entity], which must return an UTF-8 504 + string corresponding to the 505 + replacement character data. The replacement data is {e not} 506 + analysed for further references, it is added to the data as such 507 + modulo white space stripping. If [entity] returns [None] the error 508 + [`Unknown_entity_ref] is returned. 509 + 510 + {3:iseq Sequences of documents (deprecated)} 511 + 512 + {b WARNING.} This feature is deprecated and will be removed. 513 + 514 + When a well-formed sequence of signals is input, no data is consumed beyond 515 + the closing ['>'] of the document's root element. 516 + 517 + If you want to parse a document as 518 + {{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML 519 + specification, call {!eoi} after a well-formed sequence of 520 + signals, it must return [true]. If you expect another document on 521 + the same input abstraction a new well-formed sequence of signals 522 + can be {!val-input}. Use {!eoi} to check if a document follows (this 523 + may consume data). 524 + 525 + Invoking {!eoi} after a well-formed sequence of signals skips 526 + whitespaces, comments and processing instructions until it gets to 527 + either an {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML 528 + declaration} or a {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} 529 + or the start of a new element or the end of input (in which case 530 + {!eoi} returns [true]). If there is a new document but there is no 531 + XML declaration or the declaration specifies UTF-16, the same 532 + encoding as for the previous document is used. 533 + 534 + {3:inmisc Miscellaneous} 535 + {ul 536 + {- Parses the more liberal and simpler XML 1.1 537 + {{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because 538 + of namespaces).} 539 + {- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed 540 + roughly (no guarantee it is well formed) and its information is ignored.} 541 + {- The parser drops 542 + {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments}, 543 + {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and 544 + {{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.} 545 + {- Element attributes are not checked for uniqueness.} 546 + {- Attribute and character data chunks are limited by 547 + [Sys.max_string_length]. 548 + The error [`Max_buffer_size] is raised if the limit is hit.} 549 + {- Tail recursive.} 550 + {- Non validating.} 551 + } 552 + 553 + {2:output Output} 554 + {3:outenc Encoding} 555 + 556 + Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8} 557 + encoded documents. Strings given to 558 + output functions {b must be} UTF-8 encoded, no checks are 559 + performed. Unicode characters that are not legal XML 560 + {{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced 561 + by the {{:http://unicode.org/glossary/#replacement_character}Unicode 562 + replacement character}. 563 + 564 + {3:outns Namespaces} 565 + 566 + Xmlm's {{:#TYPEname}names} are 567 + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}. 568 + Expanded names are automatically converted to 569 + {{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified 570 + names} by the output abstraction. There is no particular api to specify 571 + prefixes and default namespaces, 572 + the actual result depends solely on the output 573 + of attributes belonging to the {!ns_xmlns} namespace. For example to set 574 + the default namespace of an element to {i http://example.org/myns}, 575 + use the following attribute : 576 + {[(* xmlns='http://example.org/myns' *) 577 + let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]} 578 + To bind the prefix ["ex"] to {i http://example.org/ex}, use the 579 + following attribute : 580 + {[(* xmlns:ex='http://example.org/ex' *) 581 + let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} 582 + Note that outputing input signals without 583 + touching namespace declaration attributes will preserve existing 584 + prefixes and bindings provided the same namespace name is not 585 + bound to different prefixes in a given context. 586 + 587 + The callback [ns_prefix] of an output abstraction can be used to 588 + give a prefix to a namespace name lacking a prefix binding in the 589 + current output scope. Given a namespace name the function must return 590 + the prefix to use. Note that this 591 + will {b not} add any namespace declaration attribute to the 592 + output. If the function returns [None], {!val-output} will raise 593 + [Invalid_argument]. The default function returns always [None]. 594 + {3:outindent Indentation} 595 + 596 + Output can be indented by specifying the [indent] argument when an 597 + output abstraction is created. If [indent] is [None] (default) 598 + signal output does not introduce any extra white space. If 599 + [ident] is [Some c], each {!signal} is output on its own line 600 + (for empty elements [`El_start] and [`El_end] are collapsed on a single 601 + line) and nested elements are indented with [c] space 602 + characters. 603 + 604 + {3:oseq Sequences of documents (deprecated)} 605 + 606 + {b WARNING.} This feature is deprecated and will be removed. 607 + 608 + After a well-formed sequence of signals was output, the output 609 + abstraction can be reused to output a new well-formed sequence of 610 + signals. 611 + 612 + {3:outmisc Miscellaneous} 613 + {ul 614 + {- Output on a channel does not flush it.} 615 + {- In attribute and character data you provide, markup 616 + delimiters ['<'],['>'],['&'], and ['\"'] are 617 + automatically escaped to 618 + {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined 619 + entities}.} 620 + {- No checks are peformed on the prefix and local part of output 621 + names to verify they are 622 + {{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s. 623 + For example using the tag name [("","dip d")] will produce 624 + a non well-formed document because of the space character.} 625 + {- Tail recursive.}} 626 + 627 + {2 Tips} 628 + {ul 629 + {- The best options to do an input/output round trip 630 + and preserve as much information as possible is to 631 + input with [strip = false] and output with [indent = None].} 632 + {- Complete whitespace control on output is achieved 633 + with [indent = None] and suitable [`Data] signals}} 634 + 635 + {1:ex Examples} 636 + 637 + {2:exseq Sequential processing} 638 + 639 + Sequential processing has the advantage that you don't need to get 640 + the whole document tree in memory to process it. 641 + 642 + The following function reads a {e single} document on an 643 + input channel and outputs it. 644 + {[let id ic oc = 645 + let i = Xmlm.make_input (`Channel ic) in 646 + let o = Xmlm.make_output (`Channel oc) in 647 + let rec pull i o depth = 648 + Xmlm.output o (Xmlm.peek i); 649 + match Xmlm.input i with 650 + | `El_start _ -> pull i o (depth + 1) 651 + | `El_end -> if depth = 1 then () else pull i o (depth - 1) 652 + | `Data _ -> pull i o depth 653 + | `Dtd _ -> assert false 654 + in 655 + Xmlm.output o (Xmlm.input i); (* `Dtd *) 656 + pull i o 0; 657 + if not (Xmlm.eoi i) then invalid_arg "document not well-formed"]} 658 + 659 + The following function reads a {e sequence} of documents on an 660 + input channel and outputs it. 661 + {[let id_seq ic oc = 662 + let i = Xmlm.make_input (`Channel ic) in 663 + let o = Xmlm.make_output ~nl:true (`Channel oc) in 664 + while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]} 665 + The following function reads a {e sequence} of documents on the 666 + input channel. In each document's tree it prunes non root elements 667 + whose name belongs to [prune_list]. 668 + {[let prune_docs prune_list ic oc = 669 + let i = Xmlm.make_input (`Channel ic) in 670 + let o = Xmlm.make_output ~nl:true (`Channel oc) in 671 + let copy i o = Xmlm.output o (Xmlm.input i) in 672 + let prune (name, _) = List.mem name prune_list in 673 + let rec process i o d = 674 + let rec skip i d = match Xmlm.input i with 675 + | `El_start _ -> skip i (d + 1) 676 + | `El_end -> if d = 1 then () else skip i (d - 1) 677 + | s -> skip i d 678 + in 679 + match Xmlm.peek i with 680 + | `El_start tag when prune tag -> skip i 0; process i o d 681 + | `El_start _ -> copy i o; process i o (d + 1) 682 + | `El_end -> copy i o; if d = 0 then () else process i o (d - 1) 683 + | `Data _ -> copy i o; process i o d 684 + | `Dtd _ -> assert false 685 + in 686 + let rec docs i o = 687 + copy i o; (* `Dtd *) 688 + copy i o; (* root start *) 689 + process i o 0; 690 + if Xmlm.eoi i then () else docs i o 691 + in 692 + docs i o]} 693 + 694 + {2:extree Tree processing} 695 + 696 + A document's sequence of signals can be easily converted 697 + to an arborescent data structure. Assume your trees are defined by : 698 + {[type tree = E of Xmlm.tag * tree list | D of string]} 699 + The following functions input/output xml documents from/to abstractions 700 + as value of type [tree]. 701 + {[let in_tree i = 702 + let el tag childs = E (tag, childs) in 703 + let data d = D d in 704 + Xmlm.input_doc_tree ~el ~data i 705 + 706 + let out_tree o t = 707 + let frag = function 708 + | E (tag, childs) -> `El (tag, childs) 709 + | D d -> `Data d 710 + in 711 + Xmlm.output_doc_tree frag o t]} 712 + 713 + {2:exrow Tabular data processing} 714 + 715 + We show how to process XML data that represents tabular data (some 716 + people like do that). 717 + 718 + The file we need to deal with represents nominal data about 719 + {{:http://www.w3.org/}W3C bureaucrats}. There are no namespaces 720 + and attributes are ignored. The element structure of the document 721 + is : 722 + {ul {- <list> 723 + {ul {- <bureaucrat> represents a W3C bureaucrat 724 + (zero or more). 725 + 726 + A bureaucrat contains the following elements, in order. 727 + {ul {- <name> its name (mandatory, string).} 728 + {- <surname> its surname (mandatory, string).} 729 + {- <honest> present iff he implemented one of its spec 730 + (optional, empty).} 731 + {- <obfuscation_level> its grade on the 732 + open scale of obfuscation (mandatory, float).} 733 + {- <tr> (zero or more, string), technical reports he 734 + worked on.}}}}}} 735 + 736 + In OCaml we represent a W3C bureaucrat by this type : 737 + {[type w3c_bureaucrat = { 738 + name : string; 739 + surname : string; 740 + honest : bool; 741 + obfuscation_level : float; 742 + trs : string list; }]} 743 + The following functions input and output W3C bureaucrats as lists 744 + of values of type [w3c_bureaucrat]. 745 + {[let in_w3c_bureaucrats src = 746 + let i = Xmlm.make_input ~strip:true src in 747 + let tag n = ("", n), [] in 748 + let error () = invalid_arg "parse error" in 749 + let accept s i = if Xmlm.input i = s then () else error () in 750 + let rec i_seq el acc i = match Xmlm.peek i with 751 + | `El_start _ -> i_seq el ((el i) :: acc) i 752 + | `El_end -> List.rev acc 753 + | _ -> error () 754 + in 755 + let i_el n i = 756 + accept (`El_start (tag n)) i; 757 + let d = match Xmlm.peek i with 758 + | `Data d -> ignore (Xmlm.input i); d 759 + | `El_end -> "" 760 + | _ -> error () 761 + in 762 + accept (`El_end) i; 763 + d 764 + in 765 + let i_bureaucrat i = 766 + try 767 + accept (`El_start (tag "bureaucrat")) i; 768 + let name = i_el "name" i in 769 + let surname = i_el "surname" i in 770 + let honest = match Xmlm.peek i with 771 + | `El_start (("", "honest"), []) -> ignore (i_el "honest" i); true 772 + | _ -> false 773 + in 774 + let obf = float_of_string (i_el "obfuscation_level" i) in 775 + let trs = i_seq (i_el "tr") [] i in 776 + accept (`El_end) i; 777 + { name = name; surname = surname; honest = honest; 778 + obfuscation_level = obf; trs = trs } 779 + with 780 + | Failure _ -> error () (* float_of_string *) 781 + in 782 + accept (`Dtd None) i; 783 + accept (`El_start (tag "list")) i; 784 + let bl = i_seq i_bureaucrat [] i in 785 + accept (`El_end) i; 786 + if not (Xmlm.eoi i) then invalid_arg "more than one document"; 787 + bl 788 + 789 + let out_w3c_bureaucrats dst bl = 790 + let tag n = ("", n), [] in 791 + let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in 792 + let out = Xmlm.output o in 793 + let o_el n d = 794 + out (`El_start (tag n)); 795 + if d <> "" then out (`Data d); 796 + out `El_end 797 + in 798 + let o_bureaucrat b = 799 + out (`El_start (tag "bureaucrat")); 800 + o_el "name" b.name; 801 + o_el "surname" b.surname; 802 + if b.honest then o_el "honest" ""; 803 + o_el "obfuscation_level" (string_of_float b.obfuscation_level); 804 + List.iter (o_el "tr") b.trs; 805 + out `El_end 806 + in 807 + out (`Dtd None); 808 + out (`El_start (tag "list")); 809 + List.iter o_bureaucrat bl; 810 + out (`El_end)]} 811 + *) 812 + 813 + (*--------------------------------------------------------------------------- 814 + Copyright (c) 2007 The xmlm programmers 815 + 816 + Permission to use, copy, modify, and/or distribute this software for any 817 + purpose with or without fee is hereby granted, provided that the above 818 + copyright notice and this permission notice appear in all copies. 819 + 820 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 821 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 822 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 823 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 824 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 825 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 826 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 827 + ---------------------------------------------------------------------------*)
+1
vendor/opam/xmlm/src/xmlm.mllib
··· 1 + Xmlm
+141
vendor/opam/xmlm/test/examples.ml
··· 1 + (* Examples from the documentation, this code is in public domain. *) 2 + 3 + (* Sequential processing *) 4 + 5 + let id ic oc = 6 + let i = Xmlm.make_input (`Channel ic) in 7 + let o = Xmlm.make_output (`Channel oc) in 8 + let rec pull i o depth = 9 + Xmlm.output o (Xmlm.peek i); 10 + match Xmlm.input i with 11 + | `El_start _ -> pull i o (depth + 1) 12 + | `El_end -> if depth = 1 then () else pull i o (depth - 1) 13 + | `Data _ -> pull i o depth 14 + | `Dtd _ -> assert false 15 + in 16 + Xmlm.output o (Xmlm.input i); (* `Dtd *) 17 + pull i o 0; 18 + if not (Xmlm.eoi i) then invalid_arg "document not well-formed" 19 + 20 + let id_seq ic oc = 21 + let i = Xmlm.make_input (`Channel ic) in 22 + let o = Xmlm.make_output ~nl:true (`Channel oc) in 23 + while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done 24 + 25 + let prune_docs prune_list ic oc = 26 + let i = Xmlm.make_input (`Channel ic) in 27 + let o = Xmlm.make_output ~nl:true (`Channel oc) in 28 + let copy i o = Xmlm.output o (Xmlm.input i) in 29 + let prune (name, _) = List.mem name prune_list in 30 + let rec process i o d = 31 + let rec skip i d = match Xmlm.input i with 32 + | `El_start _ -> skip i (d + 1) 33 + | `El_end -> if d = 1 then () else skip i (d - 1) 34 + | s -> skip i d 35 + in 36 + match Xmlm.peek i with 37 + | `El_start tag when prune tag -> skip i 0; process i o d 38 + | `El_start _ -> copy i o; process i o (d + 1) 39 + | `El_end -> copy i o; if d = 0 then () else process i o (d - 1) 40 + | `Data _ -> copy i o; process i o d 41 + | `Dtd _ -> assert false 42 + in 43 + let rec docs i o = 44 + copy i o; (* `Dtd *) 45 + copy i o; (* root start *) 46 + process i o 0; 47 + if Xmlm.eoi i then () else docs i o 48 + in 49 + docs i o 50 + 51 + (* Tree processing *) 52 + 53 + type tree = E of Xmlm.tag * tree list | D of string 54 + 55 + let in_tree i = 56 + let el tag childs = E (tag, childs) in 57 + let data d = D d in 58 + Xmlm.input_doc_tree ~el ~data i 59 + 60 + let out_tree o t = 61 + let frag = function 62 + | E (tag, childs) -> `El (tag, childs) 63 + | D d -> `Data d 64 + in 65 + Xmlm.output_doc_tree frag o t 66 + 67 + (* Tabular data processing. *) 68 + 69 + type w3c_bureaucrat = 70 + { name : string; 71 + surname : string; 72 + honest : bool; 73 + obfuscation_level : float; 74 + trs : string list; } 75 + 76 + let in_w3c_bureaucrats src = 77 + let i = Xmlm.make_input ~strip:true src in 78 + let tag n = ("", n), [] in 79 + let error () = invalid_arg "parse error" in 80 + let accept s i = if Xmlm.input i = s then () else error () in 81 + let rec i_seq el acc i = match Xmlm.peek i with 82 + | `El_start _ -> i_seq el ((el i) :: acc) i 83 + | `El_end -> List.rev acc 84 + | _ -> error () 85 + in 86 + let i_el n i = 87 + accept (`El_start (tag n)) i; 88 + let d = match Xmlm.peek i with 89 + | `Data d -> ignore (Xmlm.input i); d 90 + | `El_end -> "" 91 + | _ -> error () 92 + in 93 + accept (`El_end) i; 94 + d 95 + in 96 + let i_bureaucrat i = 97 + try 98 + accept (`El_start (tag "bureaucrat")) i; 99 + let name = i_el "name" i in 100 + let surname = i_el "surname" i in 101 + let honest = match Xmlm.peek i with 102 + | `El_start (("", "honest"), []) -> ignore (i_el "honest" i); true 103 + | _ -> false 104 + in 105 + let obf = float_of_string (i_el "obfuscation_level" i) in 106 + let trs = i_seq (i_el "tr") [] i in 107 + accept (`El_end) i; 108 + { name = name; surname = surname; honest = honest; 109 + obfuscation_level = obf; trs = trs } 110 + with 111 + | Failure _ -> error () (* float_of_string *) 112 + in 113 + accept (`Dtd None) i; 114 + accept (`El_start (tag "list")) i; 115 + let bl = i_seq i_bureaucrat [] i in 116 + accept (`El_end) i; 117 + if not (Xmlm.eoi i) then invalid_arg "more than one document"; 118 + bl 119 + 120 + let out_w3c_bureaucrats dst bl = 121 + let tag n = ("", n), [] in 122 + let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in 123 + let out = Xmlm.output o in 124 + let o_el n d = 125 + out (`El_start (tag n)); 126 + if d <> "" then out (`Data d); 127 + out `El_end 128 + in 129 + let o_bureaucrat b = 130 + out (`El_start (tag "bureaucrat")); 131 + o_el "name" b.name; 132 + o_el "surname" b.surname; 133 + if b.honest then o_el "honest" ""; 134 + o_el "obfuscation_level" (string_of_float b.obfuscation_level); 135 + List.iter (o_el "tr") b.trs; 136 + out `El_end 137 + in 138 + out (`Dtd None); 139 + out (`El_start (tag "list")); 140 + List.iter o_bureaucrat bl; 141 + out (`El_end)
+2
vendor/opam/xmlm/test/samples.md
··· 1 + * ftp://ftp.jclark.com/pub/xml/xmltest.zip 2 + * http://www.ximpleware.com/xmls.zip
+81
vendor/opam/xmlm/test/test.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2014 The xmlm programmers. All rights reserved. 3 + Distributed under the ISC license, see terms at the end of the file. 4 + ---------------------------------------------------------------------------*) 5 + 6 + let str = Format.sprintf 7 + let log f = Format.printf (f ^^ "@?") 8 + let fail fmt = 9 + let fail _ = failwith (Format.flush_str_formatter ()) in 10 + Format.kfprintf fail Format.str_formatter fmt 11 + 12 + (* We should add mode more coverage here see e.g. what is done in jsonm. *) 13 + 14 + let test_decode fnd exp = 15 + if fnd <> exp 16 + then fail "found: %a expected: %a" Xmlm.pp_signal fnd Xmlm.pp_signal exp 17 + 18 + let test_seq ?enc ?strip ?ns ?entity ?dtd src seq = 19 + let d = Xmlm.make_input ?enc ?strip ?ns ?entity (`String (0, src)) in 20 + let rec loop d = function [] -> () 21 + | v :: vs -> test_decode (Xmlm.input d) v; loop d vs 22 + in 23 + try 24 + let seq = match dtd with None -> `Dtd None :: seq | Some d -> d :: seq in 25 + loop d seq; 26 + if not (Xmlm.eoi d) then fail "Expected end of input" 27 + with Xmlm.Error ((l,c), e) -> 28 + fail "error:%d:%d: %s" l c (Xmlm.error_message e) 29 + 30 + let name ?(ns = "") n = (ns, n) 31 + let att ?ns n v = name ?ns n, v 32 + let tag ?(atts = []) ?ns n = (name ?ns n), atts 33 + let el ?atts ?ns n content = 34 + (`El_start (tag ?atts ?ns n)) :: List.flatten content @ [`El_end] 35 + 36 + let decoder_strip_atts () = 37 + log "Decoder attribute stripping.\n"; 38 + let test_attv v pv = 39 + test_seq (str "<e a ='%s'></e>" v) (el "e" ~atts:[att "a" pv] []) 40 + in 41 + test_attv " bla bli\n\n blo " "bla bli blo"; 42 + let test_iso_8859_15 v pv = 43 + test_seq ~enc:(Some `ISO_8859_15) (str "<e>%s</e>" v) (el "e" [[`Data pv]]) 44 + in 45 + List.iter 46 + (fun (v, pv) -> test_iso_8859_15 v pv) 47 + [ 48 + ("\065", "\u{0041}"); (* A *) 49 + ("\164", "\u{20AC}"); (* € *) 50 + ("\166", "\u{0160}"); (* Š *) 51 + ("\168", "\u{0161}"); (* š *) 52 + ("\180", "\u{017D}"); (* Ž *) 53 + ("\184", "\u{017E}"); (* ž *) 54 + ("\188", "\u{0152}"); (* Œ *) 55 + ("\189", "\u{0153}"); (* œ *) 56 + ("\190", "\u{0178}"); (* Ÿ *) 57 + ]; 58 + () 59 + 60 + let test () = 61 + Printexc.record_backtrace true; 62 + decoder_strip_atts (); 63 + log "All tests succeeded.\n" 64 + 65 + let () = if not (!Sys.interactive) then test () 66 + 67 + (*--------------------------------------------------------------------------- 68 + Copyright (c) 2014 The xmlm programmers 69 + 70 + Permission to use, copy, modify, and/or distribute this software for any 71 + purpose with or without fee is hereby granted, provided that the above 72 + copyright notice and this permission notice appear in all copies. 73 + 74 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 75 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 76 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 77 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 78 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 79 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 80 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 81 + ---------------------------------------------------------------------------*)
+11
vendor/opam/xmlm/test/test_tree.ml
··· 1 + let li d = `El ((("", "li"), []), [`Data d]) 2 + let frag = `El ((("", "ol"), []), [li "bli"; li "bla"; li "blo"]) 3 + 4 + let main () = 5 + let b = Buffer.create 233 in 6 + let o = Xmlm.make_output (`Buffer b) in 7 + Xmlm.output o (`Dtd None); 8 + Xmlm.output_tree (fun x -> x) o frag; 9 + print_endline (Buffer.contents b) 10 + 11 + let () = main ()
+259
vendor/opam/xmlm/test/xhtml.ml
··· 1 + (* XHTML 1.1 character entities. 2 + Transformed from the following data source: 3 + http://www.w3.org/\ 4 + TR/xhtml-modularization/dtd_module_defs.html#a_xhtml_character_entities *) 5 + 6 + let entities = [ 7 + ("nbsp", "\194\160"); 8 + ("iexcl", "\194\161"); 9 + ("cent", "\194\162"); 10 + ("pound", "\194\163"); 11 + ("curren", "\194\164"); 12 + ("yen", "\194\165"); 13 + ("brvbar", "\194\166"); 14 + ("sect", "\194\167"); 15 + ("uml", "\194\168"); 16 + ("copy", "\194\169"); 17 + ("ordf", "\194\170"); 18 + ("laquo", "\194\171"); 19 + ("not", "\194\172"); 20 + ("shy", "\194\173"); 21 + ("reg", "\194\174"); 22 + ("macr", "\194\175"); 23 + ("deg", "\194\176"); 24 + ("plusmn", "\194\177"); 25 + ("sup2", "\194\178"); 26 + ("sup3", "\194\179"); 27 + ("acute", "\194\180"); 28 + ("micro", "\194\181"); 29 + ("para", "\194\182"); 30 + ("middot", "\194\183"); 31 + ("cedil", "\194\184"); 32 + ("sup1", "\194\185"); 33 + ("ordm", "\194\186"); 34 + ("raquo", "\194\187"); 35 + ("frac14", "\194\188"); 36 + ("frac12", "\194\189"); 37 + ("frac34", "\194\190"); 38 + ("iquest", "\194\191"); 39 + ("Agrave", "\195\128"); 40 + ("Aacute", "\195\129"); 41 + ("Acirc", "\195\130"); 42 + ("Atilde", "\195\131"); 43 + ("Auml", "\195\132"); 44 + ("Aring", "\195\133"); 45 + ("AElig", "\195\134"); 46 + ("Ccedil", "\195\135"); 47 + ("Egrave", "\195\136"); 48 + ("Eacute", "\195\137"); 49 + ("Ecirc", "\195\138"); 50 + ("Euml", "\195\139"); 51 + ("Igrave", "\195\140"); 52 + ("Iacute", "\195\141"); 53 + ("Icirc", "\195\142"); 54 + ("Iuml", "\195\143"); 55 + ("ETH", "\195\144"); 56 + ("Ntilde", "\195\145"); 57 + ("Ograve", "\195\146"); 58 + ("Oacute", "\195\147"); 59 + ("Ocirc", "\195\148"); 60 + ("Otilde", "\195\149"); 61 + ("Ouml", "\195\150"); 62 + ("times", "\195\151"); 63 + ("Oslash", "\195\152"); 64 + ("Ugrave", "\195\153"); 65 + ("Uacute", "\195\154"); 66 + ("Ucirc", "\195\155"); 67 + ("Uuml", "\195\156"); 68 + ("Yacute", "\195\157"); 69 + ("THORN", "\195\158"); 70 + ("szlig", "\195\159"); 71 + ("agrave", "\195\160"); 72 + ("aacute", "\195\161"); 73 + ("acirc", "\195\162"); 74 + ("atilde", "\195\163"); 75 + ("auml", "\195\164"); 76 + ("aring", "\195\165"); 77 + ("aelig", "\195\166"); 78 + ("ccedil", "\195\167"); 79 + ("egrave", "\195\168"); 80 + ("eacute", "\195\169"); 81 + ("ecirc", "\195\170"); 82 + ("euml", "\195\171"); 83 + ("igrave", "\195\172"); 84 + ("iacute", "\195\173"); 85 + ("icirc", "\195\174"); 86 + ("iuml", "\195\175"); 87 + ("eth", "\195\176"); 88 + ("ntilde", "\195\177"); 89 + ("ograve", "\195\178"); 90 + ("oacute", "\195\179"); 91 + ("ocirc", "\195\180"); 92 + ("otilde", "\195\181"); 93 + ("ouml", "\195\182"); 94 + ("divide", "\195\183"); 95 + ("oslash", "\195\184"); 96 + ("ugrave", "\195\185"); 97 + ("uacute", "\195\186"); 98 + ("ucirc", "\195\187"); 99 + ("uuml", "\195\188"); 100 + ("yacute", "\195\189"); 101 + ("thorn", "\195\190"); 102 + ("yuml", "\195\191"); 103 + ("lt", "<"); 104 + ("gt", ">"); 105 + ("amp", "&"); 106 + ("apos", "'"); 107 + ("quot", "\""); 108 + ("OElig", "\197\146"); 109 + ("oelig", "\197\147"); 110 + ("Scaron", "\197\160"); 111 + ("scaron", "\197\161"); 112 + ("Yuml", "\197\184"); 113 + ("circ", "\203\134"); 114 + ("tilde", "\203\156"); 115 + ("ensp", "\226\128\130"); 116 + ("emsp", "\226\128\131"); 117 + ("thinsp", "\226\128\137"); 118 + ("zwnj", "\226\128\140"); 119 + ("zwj", "\226\128\141"); 120 + ("lrm", "\226\128\142"); 121 + ("rlm", "\226\128\143"); 122 + ("ndash", "\226\128\147"); 123 + ("mdash", "\226\128\148"); 124 + ("lsquo", "\226\128\152"); 125 + ("rsquo", "\226\128\153"); 126 + ("sbquo", "\226\128\154"); 127 + ("ldquo", "\226\128\156"); 128 + ("rdquo", "\226\128\157"); 129 + ("bdquo", "\226\128\158"); 130 + ("dagger", "\226\128\160"); 131 + ("Dagger", "\226\128\161"); 132 + ("permil", "\226\128\176"); 133 + ("lsaquo", "\226\128\185"); 134 + ("rsaquo", "\226\128\186"); 135 + ("euro", "\226\130\172"); 136 + ("fnof", "\198\146"); 137 + ("Alpha", "\206\145"); 138 + ("Beta", "\206\146"); 139 + ("Gamma", "\206\147"); 140 + ("Delta", "\206\148"); 141 + ("Epsilon", "\206\149"); 142 + ("Zeta", "\206\150"); 143 + ("Eta", "\206\151"); 144 + ("Theta", "\206\152"); 145 + ("Iota", "\206\153"); 146 + ("Kappa", "\206\154"); 147 + ("Lambda", "\206\155"); 148 + ("Mu", "\206\156"); 149 + ("Nu", "\206\157"); 150 + ("Xi", "\206\158"); 151 + ("Omicron", "\206\159"); 152 + ("Pi", "\206\160"); 153 + ("Rho", "\206\161"); 154 + ("Sigma", "\206\163"); 155 + ("Tau", "\206\164"); 156 + ("Upsilon", "\206\165"); 157 + ("Phi", "\206\166"); 158 + ("Chi", "\206\167"); 159 + ("Psi", "\206\168"); 160 + ("Omega", "\206\169"); 161 + ("alpha", "\206\177"); 162 + ("beta", "\206\178"); 163 + ("gamma", "\206\179"); 164 + ("delta", "\206\180"); 165 + ("epsilon", "\206\181"); 166 + ("zeta", "\206\182"); 167 + ("eta", "\206\183"); 168 + ("theta", "\206\184"); 169 + ("iota", "\206\185"); 170 + ("kappa", "\206\186"); 171 + ("lambda", "\206\187"); 172 + ("mu", "\206\188"); 173 + ("nu", "\206\189"); 174 + ("xi", "\206\190"); 175 + ("omicron", "\206\191"); 176 + ("pi", "\207\128"); 177 + ("rho", "\207\129"); 178 + ("sigmaf", "\207\130"); 179 + ("sigma", "\207\131"); 180 + ("tau", "\207\132"); 181 + ("upsilon", "\207\133"); 182 + ("phi", "\207\134"); 183 + ("chi", "\207\135"); 184 + ("psi", "\207\136"); 185 + ("omega", "\207\137"); 186 + ("thetasym", "\207\145"); 187 + ("upsih", "\207\146"); 188 + ("piv", "\207\150"); 189 + ("bull", "\226\128\162"); 190 + ("hellip", "\226\128\166"); 191 + ("prime", "\226\128\178"); 192 + ("Prime", "\226\128\179"); 193 + ("oline", "\226\128\190"); 194 + ("frasl", "\226\129\132"); 195 + ("weierp", "\226\132\152"); 196 + ("image", "\226\132\145"); 197 + ("real", "\226\132\156"); 198 + ("trade", "\226\132\162"); 199 + ("alefsym", "\226\132\181"); 200 + ("larr", "\226\134\144"); 201 + ("uarr", "\226\134\145"); 202 + ("rarr", "\226\134\146"); 203 + ("darr", "\226\134\147"); 204 + ("harr", "\226\134\148"); 205 + ("crarr", "\226\134\181"); 206 + ("lArr", "\226\135\144"); 207 + ("uArr", "\226\135\145"); 208 + ("rArr", "\226\135\146"); 209 + ("dArr", "\226\135\147"); 210 + ("hArr", "\226\135\148"); 211 + ("forall", "\226\136\128"); 212 + ("part", "\226\136\130"); 213 + ("exist", "\226\136\131"); 214 + ("empty", "\226\136\133"); 215 + ("nabla", "\226\136\135"); 216 + ("isin", "\226\136\136"); 217 + ("notin", "\226\136\137"); 218 + ("ni", "\226\136\139"); 219 + ("prod", "\226\136\143"); 220 + ("sum", "\226\136\145"); 221 + ("minus", "\226\136\146"); 222 + ("lowast", "\226\136\151"); 223 + ("radic", "\226\136\154"); 224 + ("prop", "\226\136\157"); 225 + ("infin", "\226\136\158"); 226 + ("ang", "\226\136\160"); 227 + ("and", "\226\136\167"); 228 + ("or", "\226\136\168"); 229 + ("cap", "\226\136\169"); 230 + ("cup", "\226\136\170"); 231 + ("int", "\226\136\171"); 232 + ("there4", "\226\136\180"); 233 + ("sim", "\226\136\188"); 234 + ("cong", "\226\137\133"); 235 + ("asymp", "\226\137\136"); 236 + ("ne", "\226\137\160"); 237 + ("equiv", "\226\137\161"); 238 + ("le", "\226\137\164"); 239 + ("ge", "\226\137\165"); 240 + ("sub", "\226\138\130"); 241 + ("sup", "\226\138\131"); 242 + ("nsub", "\226\138\132"); 243 + ("sube", "\226\138\134"); 244 + ("supe", "\226\138\135"); 245 + ("oplus", "\226\138\149"); 246 + ("otimes", "\226\138\151"); 247 + ("perp", "\226\138\165"); 248 + ("sdot", "\226\139\133"); 249 + ("lceil", "\226\140\136"); 250 + ("rceil", "\226\140\137"); 251 + ("lfloor", "\226\140\138"); 252 + ("rfloor", "\226\140\139"); 253 + ("lang", "\226\140\169"); 254 + ("rang", "\226\140\170"); 255 + ("loz", "\226\151\138"); 256 + ("spades", "\226\153\160"); 257 + ("clubs", "\226\153\163"); 258 + ("hearts", "\226\153\165"); 259 + ("diams", "\226\153\166"); ]
+248
vendor/opam/xmlm/test/xmltrip.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2007 The xmlm programmers. All rights reserved. 3 + Distributed under the ISC license, see terms at the end of the file. 4 + ---------------------------------------------------------------------------*) 5 + 6 + let str = Printf.sprintf 7 + let exec = Filename.basename Sys.executable_name 8 + let pr_err s = Printf.eprintf "%s:%s\n" exec s 9 + let apply f x ~finally y = 10 + let result = try f x with exn -> finally y; raise exn in 11 + finally y; 12 + result 13 + 14 + let fail ((l, c), e) = failwith (str "%d:%d: %s" l c (Xmlm.error_message e)) 15 + 16 + type tree = E of Xmlm.tag * tree list | D of string 17 + 18 + let in_tree i = 19 + let el tag childs = E (tag, childs) in 20 + let data d = D d in 21 + Xmlm.input_doc_tree ~el ~data i 22 + 23 + let out_tree o t = 24 + let frag = function 25 + | E (tag, childs) -> `El (tag, childs) 26 + | D d -> `Data d 27 + in 28 + Xmlm.output_doc_tree frag o t 29 + 30 + let xml_parse tree enc strip entity ns ic () = (* parse only *) 31 + let i = Xmlm.make_input ~enc ~strip ~entity ~ns (`Channel ic) in 32 + let doc i = 33 + if tree then ignore (in_tree i) else 34 + begin 35 + let rec pull i l = match Xmlm.input i with 36 + | `El_start _ -> pull i (l + 1) 37 + | `El_end -> if l = 1 then () else pull i (l - 1) 38 + | `Data _ -> pull i l 39 + | `Dtd _ -> assert false 40 + in 41 + ignore (Xmlm.input i); (* `Dtd *) 42 + pull i 0; 43 + end 44 + in 45 + try while not (Xmlm.eoi i) do doc i done 46 + with Xmlm.Error (p, e) -> fail (p, e) 47 + 48 + let xml_signals _ enc strip entity ns ic _ = (* output signals *) 49 + let i = Xmlm.make_input ~enc ~strip ~entity ~ns (`Channel ic) in 50 + let pp_signal s = Format.printf "@[%a@]@," Xmlm.pp_signal s in 51 + try 52 + Format.printf "@[<v>"; 53 + while not (Xmlm.eoi i) do pp_signal (Xmlm.input i); done; 54 + Format.printf "@]"; 55 + with Xmlm.Error (p, e) -> fail (p, e) 56 + 57 + let xml_outline tree enc strip entity ns ic oc = (* ascii outline *) 58 + let pr s = Printf.fprintf oc s in 59 + let pr_dtd dtd = match dtd with Some s -> pr "+-DTD %S\n" s | _ -> () in 60 + let pr_depth d = for k = 1 to d do pr "| " done in 61 + let pr_data d data = pr_depth d; pr "%S\n" data in 62 + let pr_name c (p, l) = if p <> "" then pr "%s:%s" p l else pr "%s" l in 63 + let pr_att d (n, v) = pr_depth (d + 1); pr "* %a = %S\n" pr_name n v in 64 + let pr_tag d (n, atts) = 65 + pr_depth d; pr "+-%a\n" pr_name n; List.iter (pr_att d) atts 66 + in 67 + let i = Xmlm.make_input ~enc ~strip ~entity ~ns (`Channel ic) in 68 + let doc i = 69 + if tree then 70 + begin 71 + let rec pr_tree d = function 72 + | (n :: next) :: path -> 73 + begin match n with 74 + | D data -> pr_data d data; pr_tree d (next :: path) 75 + | E (tag, childs) -> 76 + pr_tag d tag; pr_tree (d+1) (childs :: next :: path) 77 + end 78 + | [] :: path -> if d = 0 then () else pr_tree (d - 1) path 79 + | _ -> assert false 80 + in 81 + let dtd, t = in_tree i in 82 + pr_dtd dtd; 83 + pr_tree 0 ([t] :: []) 84 + end 85 + else 86 + begin 87 + let rec pull i l = match Xmlm.input i with 88 + | `El_start tag -> pr_tag l tag; pull i (l + 1) 89 + | `El_end -> if l = 1 then () else pull i (l - 1) 90 + | `Data d -> pr_data l d; pull i l 91 + | `Dtd _ -> assert false 92 + in 93 + pr_dtd (match Xmlm.input i with `Dtd d -> d | _ -> assert false); 94 + pull i 0; 95 + end; 96 + flush oc 97 + in 98 + try while not (Xmlm.eoi i) do doc i done 99 + with Xmlm.Error (p, e) -> fail (p, e) 100 + 101 + let xml_xml indent tree enc strip entity ns ic oc = (* xml trip *) 102 + let nl = (indent = None) in 103 + let i = Xmlm.make_input ~enc ~strip ~ns ~entity (`Channel ic) in 104 + let o = Xmlm.make_output ~nl ~indent ~ns_prefix:ns (`Channel oc) in 105 + let doc i o = 106 + if tree then (out_tree o (in_tree i)) else 107 + begin 108 + let rec pull i o depth = 109 + let s = Xmlm.input i in 110 + Xmlm.output o s; 111 + match s with 112 + | `El_start _ -> pull i o (depth + 1) 113 + | `El_end -> if depth = 1 then () else pull i o (depth - 1) 114 + | `Data _ -> pull i o depth 115 + | `Dtd _ -> assert false 116 + in 117 + Xmlm.output o (Xmlm.input i); (* `Dtd *) 118 + pull i o 0 119 + end 120 + in 121 + try while not (Xmlm.eoi i) do doc i o done 122 + with Xmlm.Error (p, e) -> fail (p, e) 123 + 124 + let with_inf f inf v = 125 + try 126 + let ic = if inf <> "" then open_in_bin inf else stdin in 127 + let close ic = if inf <> "" then close_in ic else () in 128 + apply (f ic) v ~finally:close ic 129 + with 130 + | Sys_error e -> pr_err (str " %s" e) 131 + | Failure e -> pr_err (str "%s:%s" inf e) 132 + 133 + let with_outf f ic outf = 134 + try 135 + let oc = if outf <> "" then open_out_bin outf else stdout in 136 + let close oc = if outf <> "" then close_out oc else () in 137 + apply (f ic) oc ~finally:close oc 138 + with 139 + | Sys_error e -> pr_err (str " %s" e) 140 + 141 + let entity_fun eref xhtml = 142 + if not xhtml then (if eref then fun x -> Some x else fun x -> None) else 143 + let h = Hashtbl.create 270 in 144 + List.iter (fun (e, ustr) -> Hashtbl.add h e ustr) Xhtml.entities; 145 + if eref then (fun x -> try Some (Hashtbl.find h x) with Not_found -> Some x) 146 + else (fun x -> try Some (Hashtbl.find h x) with Not_found -> None) 147 + 148 + let process signals tree enc strip eref ns xhtml parse_only outline indent 149 + suffix files = 150 + let entity = entity_fun eref xhtml in 151 + let ns = if ns then fun x -> Some x else fun x -> None in 152 + let f = 153 + if parse_only then 154 + fun inf -> with_inf (xml_parse tree enc strip entity ns) inf () 155 + else 156 + let outf inf = 157 + if inf = "" || suffix = "" then "" (* stdout *) else 158 + str "%s.%s" inf suffix 159 + in 160 + let f = 161 + if outline then xml_outline else 162 + if signals then xml_signals else 163 + (xml_xml indent) 164 + in 165 + fun inf -> 166 + with_inf (with_outf (f tree enc strip entity ns)) inf (outf inf) 167 + in 168 + List.iter f files 169 + 170 + let encoding_of_str enc = match (String.lowercase_ascii enc) with 171 + | "" -> None 172 + | "utf-8" | "utf8" | "utf_8" -> Some `UTF_8 173 + | "utf-16" | "utf16" | "utf_16" -> Some `UTF_16 174 + | "utf-16be" | "utf16be" | "utf16_be" -> Some `UTF_16BE 175 + | "utf-16le" | "utf16le" | "utf16_le" -> Some `UTF_16LE 176 + | "iso-8859-1" | "iso88591" 177 + | "iso_8859_1" | "latin1" | "latin-1" -> Some `ISO_8859_1 178 + | "ascii" | "us-ascii" -> Some `US_ASCII 179 + | e -> pr_err (str "unknown encoding '%s', trying to guess." e); None 180 + 181 + let main () = 182 + let usage = 183 + str "Usage: %s <options> <files>\n\ 184 + Reads xml files and outputs them on stdout.\n\ 185 + Options:" exec 186 + in 187 + let enc = ref "" in 188 + let strip = ref false in 189 + let ns = ref false in 190 + let eref = ref false in 191 + let xhtml = ref false in 192 + let parse_only = ref false in 193 + let tree = ref false in 194 + let signals = ref false in 195 + let outline = ref false in 196 + let indent = ref false in 197 + let suffix = ref "" in 198 + let files = ref [] in 199 + let add_file s = files := s :: !files in 200 + let options = [ 201 + "-enc", Arg.Set_string enc, 202 + "<enc>, use specified encoding, utf-8, utf-16, utf-16be, utf-16le,\n\ 203 + \ iso-8859-1, ascii (otherwise guesses)."; 204 + "-strip", Arg.Set strip, 205 + "strip and collapse white space in character data."; 206 + "-ns", Arg.Set ns, 207 + "replace unbound namespaces prefixes by themselves (on input and output)."; 208 + "-eref", Arg.Set eref, 209 + "replace unknown entity references by their name."; 210 + "-xhtml", Arg.Set xhtml, 211 + "resolve XHTML character entities."; 212 + "-p", Arg.Set parse_only, 213 + "parse only, no output."; 214 + "-t", Arg.Set tree, 215 + "build document tree in memory."; 216 + "-signals", Arg.Set signals, 217 + "outputs the stream of signals instead of xml (excludes -t)."; 218 + "-ot", Arg.Set outline, 219 + "output document ascii outline instead of xml."; 220 + "-indent", Arg.Set indent, 221 + "indent xml output."; 222 + "-trip", Arg.Set_string suffix, 223 + "<suffix>, result for file <file> is output to a file <file.suffix>."; ] 224 + in 225 + Arg.parse options add_file usage; 226 + let files = match (List.rev !files) with [] -> ["" (* stdin *) ] | l -> l in 227 + let enc = encoding_of_str !enc in 228 + let indent = if !indent then Some 2 else None in 229 + process !signals !tree enc !strip !eref !ns !xhtml !parse_only 230 + !outline indent !suffix files 231 + 232 + let () = main () 233 + 234 + (*--------------------------------------------------------------------------- 235 + Copyright (c) 2007 The xmlm programmers 236 + 237 + Permission to use, copy, modify, and/or distribute this software for any 238 + purpose with or without fee is hereby granted, provided that the above 239 + copyright notice and this permission notice appear in all copies. 240 + 241 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 242 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 243 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 244 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 245 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 246 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 247 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 248 + ---------------------------------------------------------------------------*)
vendor/opam/xmlm/xmlm.opam

This is a binary file and will not be displayed.