Merge opam/patches/uutf · anil.recoil.org/unpac-myspace-ocaml@b9b1686

+4

vendor/opam/uutf/.gitignore

··· 1 + _b0 2 + _build 3 + tmp 4 + *.install

+4

vendor/opam/uutf/.merlin

··· 1 + PKG b0.kit cmdliner 2 + S src 3 + S test 4 + B _build/**

+2

vendor/opam/uutf/.ocamlinit

··· 1 + #directory "_build/src" 2 + #load "uutf.cmo"

+1

vendor/opam/uutf/.ocp-indent

··· 1 + strict_with=always,match_clause=4,strict_else=never

+51

vendor/opam/uutf/B0.ml

··· 1 + open B0_kit.V000 2 + 3 + (* OCaml library names *) 4 + 5 + let uutf = B0_ocaml.libname "uutf" 6 + let unix = B0_ocaml.libname "unix" 7 + let cmdliner = B0_ocaml.libname "cmdliner" 8 + 9 + (* Libraries *) 10 + 11 + let uutf_lib = B0_ocaml.lib uutf ~srcs:[`Dir ~/"src"] 12 + 13 + (* Tests *) 14 + 15 + let test_uutf = B0_ocaml.test ~/"test/test_uutf.ml" ~requires:[uutf] 16 + 17 + let utftrip = 18 + let srcs = [`File ~/"test/utftrip.ml"] in 19 + let requires = [unix; uutf; cmdliner] in 20 + B0_ocaml.exe "utftrip" ~public:true ~srcs ~requires 21 + 22 + (* Packs *) 23 + 24 + let default = 25 + let meta = 26 + B0_meta.empty 27 + |> ~~ B0_meta.authors ["The uutf programmers"] 28 + |> ~~ B0_meta.maintainers ["Daniel Bünzli <daniel.buenzl i@erratique.ch>"] 29 + |> ~~ B0_meta.homepage "https://erratique.ch/software/uutf" 30 + |> ~~ B0_meta.online_doc "https://erratique.ch/software/uutf/doc/" 31 + |> ~~ B0_meta.licenses ["ISC"] 32 + |> ~~ B0_meta.repo "git+https://erratique.ch/repos/uutf.git" 33 + |> ~~ B0_meta.issues "https://github.com/dbuenzli/uutf/issues" 34 + |> ~~ B0_meta.description_tags 35 + ["unicode"; "text"; "utf-8"; "utf-16"; "codec"; "org:erratique"] 36 + |> B0_meta.tag B0_opam.tag 37 + |> ~~ B0_opam.depopts ["cmdliner", ""] 38 + |> ~~ B0_opam.conflicts 39 + [ "cmdliner", {|< "1.3.0"|}] 40 + |> ~~ B0_opam.depends 41 + [ "ocaml", {|>= "4.08.0"|}; 42 + "ocamlfind", {|build|}; 43 + "ocamlbuild", {|build|}; 44 + "topkg", {|build & >= "1.1.0"|}; 45 + ] 46 + |> B0_meta.add B0_opam.build 47 + {|[["ocaml" "pkg/pkg.ml" "build" "--dev-pkg" "%{dev}%" 48 + "--with-cmdliner" "%{cmdliner:installed}%"]]|} 49 + in 50 + B0_pack.make "default" ~doc:"uutf package" ~meta ~locked:true @@ 51 + B0_unit.list ()

+1

vendor/opam/uutf/BRZO

··· 1 + (srcs-x pkg)

+73

vendor/opam/uutf/CHANGES.md

··· 1 + v1.0.4 2025-03-10 La Forclaz (VS) 2 + --------------------------------- 3 + 4 + - Implement `Uutf.Buffer.*` with `Stdlib.Buffer.*`, no need to bloat 5 + these executables with dozens of different UTF encoders. 6 + - Require OCaml >= 4.08. 7 + - `uutftrip`, handle `cmdliner` deprecations. 8 + 9 + v1.0.3 2022-02-03 La Forclaz (VS) 10 + --------------------------------- 11 + 12 + - Support for OCaml 5.00, thanks to Kate (@kit-ty-kate) for 13 + the patch. 14 + 15 + v1.0.2 2019-02-05 La Forclaz (VS) 16 + --------------------------------- 17 + 18 + - Fix the substring folding functionality introduced in v1.0.0. 19 + It never worked correctly. 20 + 21 + v1.0.1 2017-03-07 La Forclaz (VS) 22 + --------------------------------- 23 + 24 + - OCaml 4.05.0 compatibility (removal of `Uchar.dump`). 25 + 26 + v1.0.0 2016-11-23 Zagreb 27 + ------------------------ 28 + 29 + - `Uutf.String.fold_utf_{8,16be,16le}`, allow substring folding via 30 + optional arguments. Thanks to Raphaël Proust for the idea and the 31 + patch. 32 + - OCaml standard library `Uchar.t` support. 33 + - Removes and substitutes `type Uutf.uchar = int` by the (abstract) 34 + `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous 35 + representation. 36 + - Removes `Uutf.{is_uchar,cp_to_string,pp_cp}`. `Uchar.{is_valid,dump}` 37 + can be used instead. 38 + - Safe string support. Manual sources and destinations now work on bytes 39 + rather than strings. 40 + - Build depend on topkg. 41 + - Relicense from BSD3 to ISC. 42 + 43 + v0.9.4 2015-01-23 La Forclaz (VS) 44 + --------------------------------- 45 + 46 + - Add `Uutf.decoder_byte_count` returning the bytes decoded so far. 47 + - The `utftrip` cli utility now uses `Cmdliner` which becomes an 48 + optional dependency of the package. The cli interface is not 49 + compatible with previous versions. 50 + 51 + v0.9.3 2013-08-10 Cambridge (UK) 52 + -------------------------------- 53 + 54 + - Fix wrong decoding sequence when an UTF-8 encoding guess is based on 55 + a two byte UTF-8 sequence. Thanks to Edwin Török for the report. 56 + - OPAM friendly workflow and drop OASIS support. 57 + 58 + v0.9.2 2013-01-04 La Forclaz (VS) 59 + --------------------------------- 60 + 61 + - utftrip, better tool help. 62 + - Fix `Uutf.is_uchar` always returning false. Thanks to Edwin Török 63 + for reporting and providing the fix and test. 64 + 65 + v0.9.1 2012-08-05 Lausanne 66 + -------------------------- 67 + 68 + - OASIS 0.3.0 support. 69 + 70 + v0.9.0 2012-05-05 La Forclaz (VS) 71 + --------------------------------- 72 + 73 + First release.

+6

vendor/opam/uutf/DEVEL.md

··· 1 + This project uses (perhaps the development version of) [`b0`] for 2 + development. Consult [b0 occasionally] for quick hints on how to 3 + perform common development tasks. 4 + 5 + [`b0`]: https://erratique.ch/software/b0 6 + [b0 occasionally]: https://erratique.ch/software/b0/doc/occasionally.html

+13

vendor/opam/uutf/LICENSE.md

··· 1 + Copyright (c) 2016 The uutf programmers 2 + 3 + Permission to use, copy, modify, and/or distribute this software for any 4 + purpose with or without fee is hereby granted, provided that the above 5 + copyright notice and this permission notice appear in all copies. 6 + 7 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

+46

vendor/opam/uutf/README.md

··· 1 + Uutf — Non-blocking streaming Unicode codec for OCaml 2 + ===================================================== 3 + 4 + **Warning.** You are encouraged not to use this library. 5 + 6 + - As of OCaml 4.14, both UTF encoding and decoding are available 7 + in the standard library, see the `String` and `Buffer` modules. 8 + - If you are looking for a stream abstraction compatible with 9 + effect based concurrency look into [`bytesrw`] package. 10 + 11 + --- 12 + 13 + Uutf is a non-blocking streaming codec to decode and encode the UTF-8, 14 + UTF-16, UTF-16LE and UTF-16BE encoding schemes. It can efficiently 15 + work character by character without blocking on IO. Decoders perform 16 + character position tracking and support newline normalization. 17 + 18 + Functions are also provided to fold over the characters of UTF encoded 19 + OCaml string values and to directly encode characters in OCaml 20 + Buffer.t values. 21 + 22 + Uutf has no dependency and is distributed under the ISC license. 23 + 24 + Home page: <http://erratique.ch/software/uutf> 25 + 26 + [`bytesrw`]: https://erratique.ch/software/bytesrw 27 + 28 + 29 + ## Installation 30 + 31 + Uutf can be installed with `opam`: 32 + 33 + opam install uutf 34 + 35 + If you don't use `opam` consult the [`opam`](opam) file for build 36 + instructions. 37 + 38 + ## Documentation 39 + 40 + The documentation can be consulted [online] or via `odig doc uutf`. 41 + 42 + Questions are welcome but better asked on the [OCaml forum] than on the 43 + issue tracker. 44 + 45 + [online]: http://erratique.ch/software/uutf/doc/ 46 + [OCaml forum]: https://discuss.ocaml.org/

+4

vendor/opam/uutf/_tags

··· 1 + true : bin_annot, safe_string 2 + <_b0> : -traverse 3 + <src> : include 4 + <test/utftrip.*> : package(unix), package(cmdliner)

+25

vendor/opam/uutf/doc/index.mld

··· 1 + {0 Uutf {%html: <span class="version">%%VERSION%%</span>%}} 2 + 3 + {b Warning.} You are encouraged not to use this library. 4 + 5 + {ul 6 + {- As of OCaml 4.14, both UTF encoding and decoding are available 7 + in the standard library, see the {!String} and {!Buffer} modules.} 8 + {- If you are looking for a stream abstraction compatible with 9 + effect based concurrency look into the 10 + {{:https://erratique.ch/software/bytesrw}bytesrw} package.}} 11 + 12 + Uutf is a non-blocking streaming codec to decode and encode the UTF-8, 13 + UTF-16, UTF-16LE and UTF-16BE encoding schemes. It can efficiently 14 + work character by character without blocking on IO. Decoders perform 15 + character position tracking and support newline normalization. 16 + 17 + Functions are also provided to fold over the characters of UTF encoded 18 + OCaml string values and to directly encode characters in OCaml 19 + {!Buffer.t} values. 20 + 21 + {1:library_uutf Library [uutf]} 22 + 23 + {!modules: 24 + Uutf 25 + }

+2

vendor/opam/uutf/dune-project

··· 1 + (lang dune 3.0) 2 + (name uutf)

+38

vendor/opam/uutf/opam

··· 1 + opam-version: "2.0" 2 + name: "uutf" 3 + synopsis: "Non-blocking streaming Unicode codec for OCaml" 4 + description: """\ 5 + **Warning.** You are encouraged not to use this library. 6 + 7 + - As of OCaml 4.14, both UTF encoding and decoding are available 8 + in the standard library, see the `String` and `Buffer` modules. 9 + - If you are looking for a stream abstraction compatible with 10 + effect based concurrency look into [`bytesrw`] package.""" 11 + maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>" 12 + authors: "The uutf programmers" 13 + license: "ISC" 14 + tags: ["unicode" "text" "utf-8" "utf-16" "codec" "org:erratique"] 15 + homepage: "https://erratique.ch/software/uutf" 16 + doc: "https://erratique.ch/software/uutf/doc/" 17 + bug-reports: "https://github.com/dbuenzli/uutf/issues" 18 + depends: [ 19 + "ocaml" {>= "4.08.0"} 20 + "ocamlfind" {build} 21 + "ocamlbuild" {build} 22 + "topkg" {build & >= "1.1.0"} 23 + ] 24 + depopts: ["cmdliner"] 25 + conflicts: [ 26 + "cmdliner" {< "1.3.0"} 27 + ] 28 + build: [ 29 + "ocaml" 30 + "pkg/pkg.ml" 31 + "build" 32 + "--dev-pkg" 33 + "%{dev}%" 34 + "--with-cmdliner" 35 + "%{cmdliner:installed}%" 36 + ] 37 + dev-repo: "git+https://erratique.ch/repos/uutf.git" 38 + x-maintenance-intent: ["(latest)"]

+8

vendor/opam/uutf/pkg/META

··· 1 + description = "Non-blocking streaming Unicode codec for OCaml" 2 + version = "%%VERSION_NUM%%" 3 + requires = "" 4 + archive(byte) = "uutf.cma" 5 + archive(native) = "uutf.cmxa" 6 + plugin(byte) = "uutf.cma" 7 + plugin(native) = "uutf.cmxs" 8 + exists_if = "uutf.cma uutf.cmxa"

+12

vendor/opam/uutf/pkg/pkg.ml

··· 1 + #!/usr/bin/env ocaml 2 + #use "topfind" 3 + #require "topkg" 4 + open Topkg 5 + 6 + let cmdliner = Conf.with_pkg "cmdliner" 7 + let () = 8 + Pkg.describe "uutf" @@ fun c -> 9 + let cmdliner = Conf.value c cmdliner in 10 + Ok [ Pkg.mllib "src/uutf.mllib"; 11 + Pkg.bin ~cond:cmdliner "test/utftrip"; 12 + Pkg.doc "doc/index.mld" ~dst:"odoc-pages/index.mld" ]

+6

vendor/opam/uutf/src/dune

··· 1 + (library 2 + (name uutf) 3 + (public_name uutf) 4 + (wrapped false) 5 + (flags (:standard -w -27-32)) 6 + (modules Uutf))

+772

vendor/opam/uutf/src/uutf.ml

··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2012 The uutf programmers. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + let io_buffer_size = 65536 (* IO_BUFFER_SIZE 4.0.0 *) 7 + 8 + let pp = Format.fprintf 9 + let invalid_encode () = invalid_arg "expected `Await encode" 10 + let invalid_bounds j l = 11 + invalid_arg (Printf.sprintf "invalid bounds (index %d, length %d)" j l) 12 + 13 + (* Unsafe string byte manipulations. If you don't believe the author's 14 + invariants, replacing with safe versions makes everything safe in 15 + the module. He won't be upset. *) 16 + 17 + let unsafe_chr = Char.unsafe_chr 18 + let unsafe_blit = Bytes.unsafe_blit 19 + let unsafe_array_get = Array.unsafe_get 20 + let unsafe_byte s j = Char.code (Bytes.unsafe_get s j) 21 + let unsafe_set_byte s j byte = Bytes.unsafe_set s j (Char.unsafe_chr byte) 22 + 23 + (* Unicode characters *) 24 + 25 + let u_bom = Uchar.unsafe_of_int 0xFEFF (* BOM. *) 26 + let u_rep = Uchar.unsafe_of_int 0xFFFD (* replacement character. *) 27 + 28 + (* Unicode encoding schemes *) 29 + 30 + type encoding = [ `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE ] 31 + type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ] 32 + 33 + let encoding_of_string s = match String.uppercase_ascii s with (* IANA names. *) 34 + | "UTF-8" -> Some `UTF_8 35 + | "UTF-16" -> Some `UTF_16 36 + | "UTF-16LE" -> Some `UTF_16LE 37 + | "UTF-16BE" -> Some `UTF_16BE 38 + | "ANSI_X3.4-1968" | "ISO-IR-6" | "ANSI_X3.4-1986" | "ISO_646.IRV:1991" 39 + | "ASCII" | "ISO646-US" | "US-ASCII" | "US" | "IBM367" | "CP367" | "CSASCII" -> 40 + Some `US_ASCII 41 + | "ISO_8859-1:1987" | "ISO-IR-100" | "ISO_8859-1" | "ISO-8859-1" 42 + | "LATIN1" | "L1" | "IBM819" | "CP819" | "CSISOLATIN1" -> 43 + Some `ISO_8859_1 44 + | _ -> None 45 + 46 + let encoding_to_string = function 47 + | `UTF_8 -> "UTF-8" | `UTF_16 -> "UTF-16" | `UTF_16BE -> "UTF-16BE" 48 + | `UTF_16LE -> "UTF-16LE" | `US_ASCII -> "US-ASCII" 49 + | `ISO_8859_1 -> "ISO-8859-1" 50 + 51 + (* Base character decoders. They assume enough data. *) 52 + 53 + let malformed s j l = `Malformed (Bytes.sub_string s j l) 54 + let malformed_pair be hi s j l = (* missing or half low surrogate at eoi. *) 55 + let bs1 = Bytes.(sub s j l) in 56 + let bs0 = Bytes.create 2 in 57 + let j0, j1 = if be then (0, 1) else (1, 0) in 58 + unsafe_set_byte bs0 j0 (hi lsr 8); 59 + unsafe_set_byte bs0 j1 (hi land 0xFF); 60 + `Malformed Bytes.(unsafe_to_string (cat bs0 bs1)) 61 + 62 + let r_us_ascii s j = 63 + (* assert (0 <= j && j < String.length s); *) 64 + let b0 = unsafe_byte s j in 65 + if b0 <= 127 then `Uchar (Uchar.unsafe_of_int b0) else malformed s j 1 66 + 67 + let r_iso_8859_1 s j = 68 + (* assert (0 <= j && j < String.length s); *) 69 + `Uchar (Uchar.unsafe_of_int @@ unsafe_byte s j) 70 + 71 + let utf_8_len = [| (* uchar byte length according to first UTF-8 byte. *) 72 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 73 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 74 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 75 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 76 + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 77 + 1; 1; 1; 1; 1; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 78 + 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 79 + 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 80 + 0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 81 + 2; 2; 2; 2; 2; 2; 2; 2; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 82 + 4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |] 83 + 84 + let r_utf_8 s j l = 85 + (* assert (0 <= j && 0 <= l && j + l <= String.length s); *) 86 + let uchar c = `Uchar (Uchar.unsafe_of_int c) in 87 + match l with 88 + | 1 -> uchar (unsafe_byte s j) 89 + | 2 -> 90 + let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in 91 + if b1 lsr 6 != 0b10 then malformed s j l else 92 + uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)) 93 + | 3 -> 94 + let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in 95 + let b2 = unsafe_byte s (j + 2) in 96 + let c = ((b0 land 0x0F) lsl 12) lor 97 + ((b1 land 0x3F) lsl 6) lor 98 + (b2 land 0x3F) 99 + in 100 + if b2 lsr 6 != 0b10 then malformed s j l else 101 + begin match b0 with 102 + | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c 103 + | 0xED -> if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c 104 + | _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c 105 + end 106 + | 4 -> 107 + let b0 = unsafe_byte s j in let b1 = unsafe_byte s (j + 1) in 108 + let b2 = unsafe_byte s (j + 2) in let b3 = unsafe_byte s (j + 3) in 109 + let c = (((b0 land 0x07) lsl 18) lor 110 + ((b1 land 0x3F) lsl 12) lor 111 + ((b2 land 0x3F) lsl 6) lor 112 + (b3 land 0x3F)) 113 + in 114 + if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l else 115 + begin match b0 with 116 + | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c 117 + | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c 118 + | _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c 119 + end 120 + | _ -> assert false 121 + 122 + let r_utf_16 s j0 j1 = (* May return a high surrogate. *) 123 + (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *) 124 + let b0 = unsafe_byte s j0 in let b1 = unsafe_byte s j1 in 125 + let u = (b0 lsl 8) lor b1 in 126 + if u < 0xD800 || u > 0xDFFF then `Uchar (Uchar.unsafe_of_int u) else 127 + if u > 0xDBFF then malformed s (min j0 j1) 2 else `Hi u 128 + 129 + let r_utf_16_lo hi s j0 j1 = (* Combines [hi] with a low surrogate. *) 130 + (* assert (0 <= j0 && 0 <= j1 && max j0 j1 < String.length s); *) 131 + let b0 = unsafe_byte s j0 in 132 + let b1 = unsafe_byte s j1 in 133 + let lo = (b0 lsl 8) lor b1 in 134 + if lo < 0xDC00 || lo > 0xDFFF 135 + then malformed_pair (j0 < j1 (* true => be *)) hi s (min j0 j1) 2 136 + else `Uchar (Uchar.unsafe_of_int ((((hi land 0x3FF) lsl 10) lor 137 + (lo land 0x3FF)) + 0x10000)) 138 + 139 + let r_encoding s j l = (* guess encoding with max. 3 bytes. *) 140 + (* assert (0 <= j && 0 <= l && j + l <= String.length s) *) 141 + let some i = if i < l then Some (unsafe_byte s (j + i)) else None in 142 + match (some 0), (some 1), (some 2) with 143 + | Some 0xEF, Some 0xBB, Some 0xBF -> `UTF_8 `BOM 144 + | Some 0xFE, Some 0xFF, _ -> `UTF_16BE `BOM 145 + | Some 0xFF, Some 0xFE, _ -> `UTF_16LE `BOM 146 + | Some 0x00, Some p, _ when p > 0 -> `UTF_16BE (`ASCII p) 147 + | Some p, Some 0x00, _ when p > 0 -> `UTF_16LE (`ASCII p) 148 + | Some u, _, _ when utf_8_len.(u) <> 0 -> `UTF_8 `Decode 149 + | Some _, Some _, _ -> `UTF_16BE `Decode 150 + | Some _, None , None -> `UTF_8 `Decode 151 + | None , None , None -> `UTF_8 `End 152 + | None , Some _, _ -> assert false 153 + | Some _, None , Some _ -> assert false 154 + | None , None , Some _ -> assert false 155 + 156 + (* Decode *) 157 + 158 + type src = [ `Channel of in_channel | `String of string | `Manual ] 159 + type nln = [ `ASCII of Uchar.t | `NLF of Uchar.t | `Readline of Uchar.t ] 160 + type decode = [ `Await | `End | `Malformed of string | `Uchar of Uchar.t] 161 + 162 + let pp_decode ppf = function 163 + | `Uchar u -> pp ppf "@[`Uchar U+%04X@]" (Uchar.to_int u) 164 + | `End -> pp ppf "`End" 165 + | `Await -> pp ppf "`Await" 166 + | `Malformed bs -> 167 + let l = String.length bs in 168 + pp ppf "@[`Malformed ("; 169 + if l > 0 then pp ppf "%02X" (Char.code (bs.[0])); 170 + for i = 1 to l - 1 do pp ppf " %02X" (Char.code (bs.[i])) done; 171 + pp ppf ")@]" 172 + 173 + type decoder = 174 + { src : src; (* input source. *) 175 + mutable encoding : decoder_encoding; (* decoded encoding. *) 176 + nln : nln option; (* newline normalization (if any). *) 177 + nl : Uchar.t; (* newline normalization character. *) 178 + mutable i : Bytes.t; (* current input chunk. *) 179 + mutable i_pos : int; (* input current position. *) 180 + mutable i_max : int; (* input maximal position. *) 181 + t : Bytes.t; (* four bytes temporary buffer for overlapping reads. *) 182 + mutable t_len : int; (* current byte length of [t]. *) 183 + mutable t_need : int; (* number of bytes needed in [t]. *) 184 + mutable removed_bom : bool; (* [true] if an initial BOM was removed. *) 185 + mutable last_cr : bool; (* [true] if last char was CR. *) 186 + mutable line : int; (* line number. *) 187 + mutable col : int; (* column number. *) 188 + mutable byte_count : int; (* byte count. *) 189 + mutable count : int; (* char count. *) 190 + mutable pp : (* decoder post-processor for BOM, position and nln. *) 191 + decoder -> [ `Malformed of string | `Uchar of Uchar.t ] -> decode; 192 + mutable k : decoder -> decode } (* decoder continuation. *) 193 + 194 + (* On decodes that overlap two (or more) [d.i] buffers, we use [t_fill] to copy 195 + the input data to [d.t] and decode from there. If the [d.i] buffers are not 196 + too small this is faster than continuation based byte per byte writes. 197 + 198 + End of input (eoi) is signalled by [d.i_pos = 0] and [d.i_max = min_int] 199 + which implies that [i_rem d < 0] is [true]. *) 200 + 201 + let i_rem d = d.i_max - d.i_pos + 1 (* remaining bytes to read in [d.i]. *) 202 + let eoi d = 203 + d.i <- Bytes.empty; d.i_pos <- 0; d.i_max <- min_int (* set eoi in [d]. *) 204 + 205 + let src d s j l = (* set [d.i] with [s]. *) 206 + if (j < 0 || l < 0 || j + l > Bytes.length s) then invalid_bounds j l else 207 + if (l = 0) then eoi d else 208 + (d.i <- s; d.i_pos <- j; d.i_max <- j + l - 1) 209 + 210 + let refill k d = match d.src with (* get new input in [d.i] and [k]ontinue. *) 211 + | `Manual -> d.k <- k; `Await 212 + | `String _ -> eoi d; k d 213 + | `Channel ic -> 214 + let rc = input ic d.i 0 (Bytes.length d.i) in 215 + (src d d.i 0 rc; k d) 216 + 217 + let t_need d need = d.t_len <- 0; d.t_need <- need 218 + let rec t_fill k d = (* get [d.t_need] bytes (or less if eoi) in [i.t]. *) 219 + let blit d l = 220 + unsafe_blit d.i d.i_pos d.t d.t_len (* write pos. *) l; 221 + d.i_pos <- d.i_pos + l; d.t_len <- d.t_len + l; 222 + in 223 + let rem = i_rem d in 224 + if rem < 0 (* eoi *) then k d else 225 + let need = d.t_need - d.t_len in 226 + if rem < need then (blit d rem; refill (t_fill k) d) else (blit d need; k d) 227 + 228 + let ret k v byte_count d = (* return post-processed [v]. *) 229 + d.k <- k; d.byte_count <- d.byte_count + byte_count; d.pp d v 230 + 231 + (* Decoders. *) 232 + 233 + let rec decode_us_ascii d = 234 + let rem = i_rem d in 235 + if rem <= 0 then (if rem < 0 then `End else refill decode_us_ascii d) else 236 + let j = d.i_pos in 237 + d.i_pos <- d.i_pos + 1; ret decode_us_ascii (r_us_ascii d.i j) 1 d 238 + 239 + let rec decode_iso_8859_1 d = 240 + let rem = i_rem d in 241 + if rem <= 0 then (if rem < 0 then `End else refill decode_iso_8859_1 d) else 242 + let j = d.i_pos in 243 + d.i_pos <- d.i_pos + 1; ret decode_iso_8859_1 (r_iso_8859_1 d.i j) 1 d 244 + 245 + (* UTF-8 decoder *) 246 + 247 + let rec t_decode_utf_8 d = (* decode from [d.t]. *) 248 + if d.t_len < d.t_need 249 + then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d 250 + else ret decode_utf_8 (r_utf_8 d.t 0 d.t_len) d.t_len d 251 + 252 + and decode_utf_8 d = 253 + let rem = i_rem d in 254 + if rem <= 0 then (if rem < 0 then `End else refill decode_utf_8 d) else 255 + let need = unsafe_array_get utf_8_len (unsafe_byte d.i d.i_pos) in 256 + if rem < need then (t_need d need; t_fill t_decode_utf_8 d) else 257 + let j = d.i_pos in 258 + if need = 0 259 + then (d.i_pos <- d.i_pos + 1; ret decode_utf_8 (malformed d.i j 1) 1 d) 260 + else (d.i_pos <- d.i_pos + need; ret decode_utf_8 (r_utf_8 d.i j need) need d) 261 + 262 + (* UTF-16BE decoder *) 263 + 264 + let rec t_decode_utf_16be_lo hi d = (* decode from [d.t]. *) 265 + let bcount = d.t_len + 2 (* hi count *) in 266 + if d.t_len < d.t_need 267 + then ret decode_utf_16be (malformed_pair true hi d.t 0 d.t_len) bcount d 268 + else ret decode_utf_16be (r_utf_16_lo hi d.t 0 1) bcount d 269 + 270 + and t_decode_utf_16be d = (* decode from [d.t]. *) 271 + if d.t_len < d.t_need 272 + then ret decode_utf_16be (malformed d.t 0 d.t_len) d.t_len d 273 + else decode_utf_16be_lo (r_utf_16 d.t 0 1) d 274 + 275 + and decode_utf_16be_lo v d = match v with 276 + | `Uchar _ | `Malformed _ as v -> ret decode_utf_16be v 2 d 277 + | `Hi hi -> 278 + let rem = i_rem d in 279 + if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16be_lo hi) d) else 280 + let j = d.i_pos in 281 + d.i_pos <- d.i_pos + 2; 282 + ret decode_utf_16be (r_utf_16_lo hi d.i j (j + 1)) 4 d 283 + 284 + and decode_utf_16be d = 285 + let rem = i_rem d in 286 + if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16be d) else 287 + if rem < 2 then (t_need d 2; t_fill t_decode_utf_16be d) else 288 + let j = d.i_pos in 289 + d.i_pos <- d.i_pos + 2; decode_utf_16be_lo (r_utf_16 d.i j (j + 1)) d 290 + 291 + (* UTF-16LE decoder, same as UTF-16BE with byte swapped. *) 292 + 293 + let rec t_decode_utf_16le_lo hi d = (* decode from [d.t]. *) 294 + let bcount = d.t_len + 2 (* hi count *) in 295 + if d.t_len < d.t_need 296 + then ret decode_utf_16le (malformed_pair false hi d.t 0 d.t_len) bcount d 297 + else ret decode_utf_16le (r_utf_16_lo hi d.t 1 0) bcount d 298 + 299 + and t_decode_utf_16le d = (* decode from [d.t]. *) 300 + if d.t_len < d.t_need 301 + then ret decode_utf_16le (malformed d.t 0 d.t_len) d.t_len d 302 + else decode_utf_16le_lo (r_utf_16 d.t 1 0) d 303 + 304 + and decode_utf_16le_lo v d = match v with 305 + | `Uchar _ | `Malformed _ as v -> ret decode_utf_16le v 2 d 306 + | `Hi hi -> 307 + let rem = i_rem d in 308 + if rem < 2 then (t_need d 2; t_fill (t_decode_utf_16le_lo hi) d) else 309 + let j = d.i_pos in 310 + d.i_pos <- d.i_pos + 2; 311 + ret decode_utf_16le (r_utf_16_lo hi d.i (j + 1) j) 4 d 312 + 313 + and decode_utf_16le d = 314 + let rem = i_rem d in 315 + if rem <= 0 then (if rem < 0 then `End else refill decode_utf_16le d) else 316 + if rem < 2 then (t_need d 2; t_fill t_decode_utf_16le d) else 317 + let j = d.i_pos in 318 + d.i_pos <- d.i_pos + 2; decode_utf_16le_lo (r_utf_16 d.i (j + 1) j) d 319 + 320 + (* Encoding guessing. The guess is simple but starting the decoder 321 + after is tedious, uutf's decoders are not designed to put bytes 322 + back in the stream. *) 323 + 324 + let guessed_utf_8 d = (* start decoder after `UTF_8 guess. *) 325 + let b3 d = (* handles the third read byte. *) 326 + let b3 = unsafe_byte d.t 2 in 327 + match utf_8_len.(b3) with 328 + | 0 -> ret decode_utf_8 (malformed d.t 2 1) 1 d 329 + | n -> 330 + d.t_need <- n; d.t_len <- 1; unsafe_set_byte d.t 0 b3; 331 + t_fill t_decode_utf_8 d 332 + in 333 + let b2 d = (* handle second read byte. *) 334 + let b2 = unsafe_byte d.t 1 in 335 + let b3 = if d.t_len > 2 then b3 else decode_utf_8 (* decodes `End *) in 336 + match utf_8_len.(b2) with 337 + | 0 -> ret b3 (malformed d.t 1 1) 1 d 338 + | 1 -> ret b3 (r_utf_8 d.t 1 1) 1 d 339 + | n -> (* copy d.t.(1-2) to d.t.(0-1) and decode *) 340 + d.t_need <- n; 341 + unsafe_set_byte d.t 0 b2; 342 + if (d.t_len < 3) then d.t_len <- 1 else 343 + (d.t_len <- 2; unsafe_set_byte d.t 1 (unsafe_byte d.t 2); ); 344 + t_fill t_decode_utf_8 d 345 + in 346 + let b1 = unsafe_byte d.t 0 in (* handle first read byte. *) 347 + let b2 = if d.t_len > 1 then b2 else decode_utf_8 (* decodes `End *) in 348 + match utf_8_len.(b1) with 349 + | 0 -> ret b2 (malformed d.t 0 1) 1 d 350 + | 1 -> ret b2 (r_utf_8 d.t 0 1) 1 d 351 + | 2 -> 352 + if d.t_len < 2 then ret decode_utf_8 (malformed d.t 0 1) 1 d else 353 + if d.t_len < 3 then ret decode_utf_8 (r_utf_8 d.t 0 2) 2 d else 354 + ret b3 (r_utf_8 d.t 0 2) 2 d 355 + | 3 -> 356 + if d.t_len < 3 357 + then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d 358 + else ret decode_utf_8 (r_utf_8 d.t 0 3) 3 d 359 + | 4 -> 360 + if d.t_len < 3 361 + then ret decode_utf_8 (malformed d.t 0 d.t_len) d.t_len d 362 + else (d.t_need <- 4; t_fill t_decode_utf_8 d) 363 + | n -> assert false 364 + 365 + let guessed_utf_16 d be v = (* start decoder after `UTF_16{BE,LE} guess. *) 366 + let decode_utf_16, t_decode_utf_16, t_decode_utf_16_lo, j0, j1 = 367 + if be then decode_utf_16be, t_decode_utf_16be, t_decode_utf_16be_lo, 0, 1 368 + else decode_utf_16le, t_decode_utf_16le, t_decode_utf_16le_lo, 1, 0 369 + in 370 + let b3 k d = 371 + if d.t_len < 3 then decode_utf_16 d (* decodes `End *) else 372 + begin (* copy d.t.(2) to d.t.(0) and decode. *) 373 + d.t_need <- 2; d.t_len <- 1; 374 + unsafe_set_byte d.t 0 (unsafe_byte d.t 2); 375 + t_fill k d 376 + end 377 + in 378 + match v with 379 + | `BOM -> ret (b3 t_decode_utf_16) (`Uchar u_bom) 2 d 380 + | `ASCII u -> ret (b3 t_decode_utf_16) (`Uchar (Uchar.unsafe_of_int u)) 2 d 381 + | `Decode -> 382 + match r_utf_16 d.t j0 j1 with 383 + | `Malformed _ | `Uchar _ as v -> ret (b3 t_decode_utf_16) v 2 d 384 + | `Hi hi -> 385 + if d.t_len < 3 386 + then ret decode_utf_16 (malformed_pair be hi Bytes.empty 0 0) d.t_len d 387 + else (b3 (t_decode_utf_16_lo hi)) d 388 + 389 + let guess_encoding d = (* guess encoding and start decoder. *) 390 + let setup d = match r_encoding d.t 0 d.t_len with 391 + | `UTF_8 r -> 392 + d.encoding <- `UTF_8; d.k <- decode_utf_8; 393 + begin match r with 394 + | `BOM -> ret decode_utf_8 (`Uchar u_bom) 3 d 395 + | `Decode -> guessed_utf_8 d 396 + | `End -> `End 397 + end 398 + | `UTF_16BE r -> 399 + d.encoding <- `UTF_16BE; d.k <- decode_utf_16be; guessed_utf_16 d true r 400 + | `UTF_16LE r -> 401 + d.encoding <- `UTF_16LE; d.k <- decode_utf_16le; guessed_utf_16 d false r 402 + 403 + in 404 + (t_need d 3; t_fill setup d) 405 + 406 + (* Character post-processors. Used for BOM handling, newline 407 + normalization and position tracking. The [pp_remove_bom] is only 408 + used for the first character to remove a possible initial BOM and 409 + handle UTF-16 endianness recognition. *) 410 + 411 + let nline d = d.col <- 0; d.line <- d.line + 1 (* inlined. *) 412 + let ncol d = d.col <- d.col + 1 (* inlined. *) 413 + let ncount d = d.count <- d.count + 1 (* inlined. *) 414 + let cr d b = d.last_cr <- b (* inlined. *) 415 + 416 + let pp_remove_bom utf16 pp d = function(* removes init. BOM, handles UTF-16. *) 417 + | `Malformed _ as v -> d.removed_bom <- false; d.pp <- pp; d.pp d v 418 + | `Uchar u as v -> 419 + match Uchar.to_int u with 420 + | 0xFEFF (* BOM *) -> 421 + if utf16 then (d.encoding <- `UTF_16BE; d.k <- decode_utf_16be); 422 + d.removed_bom <- true; d.pp <- pp; d.k d 423 + | 0xFFFE (* BOM reversed from decode_utf_16be *) when utf16 -> 424 + d.encoding <- `UTF_16LE; d.k <- decode_utf_16le; 425 + d.removed_bom <- true; d.pp <- pp; d.k d 426 + | _ -> 427 + d.removed_bom <- false; d.pp <- pp; d.pp d v 428 + 429 + let pp_nln_none d = function 430 + | `Malformed _ as v -> cr d false; ncount d; ncol d; v 431 + | `Uchar u as v -> 432 + match Uchar.to_int u with 433 + | 0x000A (* LF *) -> 434 + let last_cr = d.last_cr in 435 + cr d false; ncount d; if last_cr then v else (nline d; v) 436 + | 0x000D (* CR *) -> cr d true; ncount d; nline d; v 437 + | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) -> 438 + cr d false; ncount d; nline d; v 439 + | _ -> 440 + cr d false; ncount d; ncol d; v 441 + 442 + let pp_nln_readline d = function 443 + | `Malformed _ as v -> cr d false; ncount d; ncol d; v 444 + | `Uchar u as v -> 445 + match Uchar.to_int u with 446 + | 0x000A (* LF *) -> 447 + let last_cr = d.last_cr in 448 + cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl) 449 + | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl 450 + | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) -> 451 + cr d false; ncount d; nline d; `Uchar d.nl 452 + | _ -> 453 + cr d false; ncount d; ncol d; v 454 + 455 + let pp_nln_nlf d = function 456 + | `Malformed _ as v -> cr d false; ncount d; ncol d; v 457 + | `Uchar u as v -> 458 + match Uchar.to_int u with 459 + | 0x000A (* LF *) -> 460 + let last_cr = d.last_cr in 461 + cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl) 462 + | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl 463 + | 0x0085 (* NEL *) -> cr d false; ncount d; nline d; `Uchar d.nl 464 + | (0x000C | 0x2028 | 0x2029) (* FF | LS | PS *) -> 465 + cr d false; ncount d; nline d; v 466 + | _ -> 467 + cr d false; ncount d; ncol d; v 468 + 469 + let pp_nln_ascii d = function 470 + | `Malformed _ as v -> cr d false; ncount d; ncol d; v 471 + | `Uchar u as v -> 472 + match Uchar.to_int u with 473 + | 0x000A (* LF *) -> 474 + let last_cr = d.last_cr in 475 + cr d false; if last_cr then d.k d else (ncount d; nline d; `Uchar d.nl) 476 + | 0x000D (* CR *) -> cr d true; ncount d; nline d; `Uchar d.nl 477 + | (0x0085 | 0x000C | 0x2028 | 0x2029) (* NEL | FF | LS | PS *) -> 478 + cr d false; ncount d; nline d; v 479 + | _ -> 480 + cr d false; ncount d; ncol d; v 481 + 482 + let decode_fun = function 483 + | `UTF_8 -> decode_utf_8 484 + | `UTF_16 -> decode_utf_16be (* see [pp_remove_bom]. *) 485 + | `UTF_16BE -> decode_utf_16be 486 + | `UTF_16LE -> decode_utf_16le 487 + | `US_ASCII -> decode_us_ascii 488 + | `ISO_8859_1 -> decode_iso_8859_1 489 + 490 + let decoder ?nln ?encoding src = 491 + let pp, nl = match nln with 492 + | None -> pp_nln_none, Uchar.unsafe_of_int 0x000A (* not used. *) 493 + | Some (`ASCII nl) -> pp_nln_ascii, nl 494 + | Some (`NLF nl) -> pp_nln_nlf, nl 495 + | Some (`Readline nl) -> pp_nln_readline, nl 496 + in 497 + let encoding, k = match encoding with 498 + | None -> `UTF_8, guess_encoding 499 + | Some e -> (e :> decoder_encoding), decode_fun e 500 + in 501 + let i, i_pos, i_max = match src with 502 + | `Manual -> Bytes.empty, 1, 0 (* implies src_rem d = 0. *) 503 + | `Channel _ -> Bytes.create io_buffer_size, 1, 0 (* idem. *) 504 + | `String s -> Bytes.unsafe_of_string s, 0, String.length s - 1 505 + in 506 + { src = (src :> src); encoding; nln = (nln :> nln option); nl; 507 + i; i_pos; i_max; t = Bytes.create 4; t_len = 0; t_need = 0; 508 + removed_bom = false; last_cr = false; line = 1; col = 0; 509 + byte_count = 0; count = 0; 510 + pp = pp_remove_bom (encoding = `UTF_16) pp; k } 511 + 512 + let decode d = d.k d 513 + let decoder_line d = d.line 514 + let decoder_col d = d.col 515 + let decoder_byte_count d = d.byte_count 516 + let decoder_count d = d.count 517 + let decoder_removed_bom d = d.removed_bom 518 + let decoder_src d = d.src 519 + let decoder_nln d = d.nln 520 + let decoder_encoding d = d.encoding 521 + let set_decoder_encoding d e = 522 + d.encoding <- (e :> decoder_encoding); d.k <- decode_fun e 523 + 524 + (* Encode *) 525 + 526 + type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ] 527 + type encode = [ `Await | `End | `Uchar of Uchar.t ] 528 + type encoder = 529 + { dst : dst; (* output destination. *) 530 + encoding : encoding; (* encoded encoding. *) 531 + mutable o : Bytes.t; (* current output chunk. *) 532 + mutable o_pos : int; (* next output position to write. *) 533 + mutable o_max : int; (* maximal output position to write. *) 534 + t : Bytes.t; (* four bytes buffer for overlapping writes. *) 535 + mutable t_pos : int; (* next position to read in [t]. *) 536 + mutable t_max : int; (* maximal position to read in [t]. *) 537 + mutable k : (* encoder continuation. *) 538 + encoder -> encode -> [ `Ok | `Partial ] } 539 + 540 + (* On encodes that overlap two (or more) [e.o] buffers, we encode the 541 + character to the temporary buffer [o.t] and continue with 542 + [tmp_flush] to write this data on the different [e.o] buffers. If 543 + the [e.o] buffers are not too small this is faster than 544 + continuation based byte per byte writes. *) 545 + 546 + let o_rem e = e.o_max - e.o_pos + 1 (* remaining bytes to write in [e.o]. *) 547 + let dst e s j l = (* set [e.o] with [s]. *) 548 + if (j < 0 || l < 0 || j + l > Bytes.length s) then invalid_bounds j l; 549 + e.o <- s; e.o_pos <- j; e.o_max <- j + l - 1 550 + 551 + let partial k e = function `Await -> k e | `Uchar _ | `End -> invalid_encode () 552 + let flush k e = match e.dst with(* get free storage in [d.o] and [k]ontinue. *) 553 + | `Manual -> e.k <- partial k; `Partial 554 + | `Channel oc -> output oc e.o 0 e.o_pos; e.o_pos <- 0; k e 555 + | `Buffer b -> 556 + let o = Bytes.unsafe_to_string e.o in 557 + Buffer.add_substring b o 0 e.o_pos; e.o_pos <- 0; k e 558 + 559 + 560 + let t_range e max = e.t_pos <- 0; e.t_max <- max 561 + let rec t_flush k e = (* flush [d.t] up to [d.t_max] in [d.i]. *) 562 + let blit e l = 563 + unsafe_blit e.t e.t_pos e.o e.o_pos l; 564 + e.o_pos <- e.o_pos + l; e.t_pos <- e.t_pos + l 565 + in 566 + let rem = o_rem e in 567 + let len = e.t_max - e.t_pos + 1 in 568 + if rem < len then (blit e rem; flush (t_flush k) e) else (blit e len; k e) 569 + 570 + (* Encoders. *) 571 + 572 + let rec encode_utf_8 e v = 573 + let k e = e.k <- encode_utf_8; `Ok in 574 + match v with 575 + | `Await -> k e 576 + | `End -> flush k e 577 + | `Uchar u as v -> 578 + let u = Uchar.to_int u in 579 + let rem = o_rem e in 580 + if u <= 0x007F then 581 + if rem < 1 then flush (fun e -> encode_utf_8 e v) e else 582 + (unsafe_set_byte e.o e.o_pos u; e.o_pos <- e.o_pos + 1; k e) 583 + else if u <= 0x07FF then 584 + begin 585 + let s, j, k = 586 + if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else 587 + let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k) 588 + in 589 + unsafe_set_byte s j (0xC0 lor (u lsr 6)); 590 + unsafe_set_byte s (j + 1) (0x80 lor (u land 0x3F)); 591 + k e 592 + end 593 + else if u <= 0xFFFF then 594 + begin 595 + let s, j, k = 596 + if rem < 3 then (t_range e 2; e.t, 0, t_flush k) else 597 + let j = e.o_pos in (e.o_pos <- e.o_pos + 3; e.o, j, k) 598 + in 599 + unsafe_set_byte s j (0xE0 lor (u lsr 12)); 600 + unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 6) land 0x3F)); 601 + unsafe_set_byte s (j + 2) (0x80 lor (u land 0x3F)); 602 + k e 603 + end 604 + else 605 + begin 606 + let s, j, k = 607 + if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else 608 + let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k) 609 + in 610 + unsafe_set_byte s j (0xF0 lor (u lsr 18)); 611 + unsafe_set_byte s (j + 1) (0x80 lor ((u lsr 12) land 0x3F)); 612 + unsafe_set_byte s (j + 2) (0x80 lor ((u lsr 6) land 0x3F)); 613 + unsafe_set_byte s (j + 3) (0x80 lor (u land 0x3F)); 614 + k e 615 + end 616 + 617 + let rec encode_utf_16be e v = 618 + let k e = e.k <- encode_utf_16be; `Ok in 619 + match v with 620 + | `Await -> k e 621 + | `End -> flush k e 622 + | `Uchar u -> 623 + let u = Uchar.to_int u in 624 + let rem = o_rem e in 625 + if u < 0x10000 then 626 + begin 627 + let s, j, k = 628 + if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else 629 + let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k) 630 + in 631 + unsafe_set_byte s j (u lsr 8); 632 + unsafe_set_byte s (j + 1) (u land 0xFF); 633 + k e 634 + end else begin 635 + let s, j, k = 636 + if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else 637 + let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k) 638 + in 639 + let u' = u - 0x10000 in 640 + let hi = (0xD800 lor (u' lsr 10)) in 641 + let lo = (0xDC00 lor (u' land 0x3FF)) in 642 + unsafe_set_byte s j (hi lsr 8); 643 + unsafe_set_byte s (j + 1) (hi land 0xFF); 644 + unsafe_set_byte s (j + 2) (lo lsr 8); 645 + unsafe_set_byte s (j + 3) (lo land 0xFF); 646 + k e 647 + end 648 + 649 + let rec encode_utf_16le e v = (* encode_uft_16be with bytes swapped. *) 650 + let k e = e.k <- encode_utf_16le; `Ok in 651 + match v with 652 + | `Await -> k e 653 + | `End -> flush k e 654 + | `Uchar u -> 655 + let u = Uchar.to_int u in 656 + let rem = o_rem e in 657 + if u < 0x10000 then 658 + begin 659 + let s, j, k = 660 + if rem < 2 then (t_range e 1; e.t, 0, t_flush k) else 661 + let j = e.o_pos in (e.o_pos <- e.o_pos + 2; e.o, j, k) 662 + in 663 + unsafe_set_byte s j (u land 0xFF); 664 + unsafe_set_byte s (j + 1) (u lsr 8); 665 + k e 666 + end 667 + else 668 + begin 669 + let s, j, k = 670 + if rem < 4 then (t_range e 3; e.t, 0, t_flush k) else 671 + let j = e.o_pos in (e.o_pos <- e.o_pos + 4; e.o, j, k) 672 + in 673 + let u' = u - 0x10000 in 674 + let hi = (0xD800 lor (u' lsr 10)) in 675 + let lo = (0xDC00 lor (u' land 0x3FF)) in 676 + unsafe_set_byte s j (hi land 0xFF); 677 + unsafe_set_byte s (j + 1) (hi lsr 8); 678 + unsafe_set_byte s (j + 2) (lo land 0xFF); 679 + unsafe_set_byte s (j + 3) (lo lsr 8); 680 + k e 681 + end 682 + 683 + let encode_fun = function 684 + | `UTF_8 -> encode_utf_8 685 + | `UTF_16 -> encode_utf_16be 686 + | `UTF_16BE -> encode_utf_16be 687 + | `UTF_16LE -> encode_utf_16le 688 + 689 + let encoder encoding dst = 690 + let o, o_pos, o_max = match dst with 691 + | `Manual -> Bytes.empty, 1, 0 (* implies o_rem e = 0. *) 692 + | `Buffer _ 693 + | `Channel _ -> Bytes.create io_buffer_size, 0, io_buffer_size - 1 694 + in 695 + { dst = (dst :> dst); encoding = (encoding :> encoding); o; o_pos; o_max; 696 + t = Bytes.create 4; t_pos = 1; t_max = 0; k = encode_fun encoding} 697 + 698 + let encode e v = e.k e (v :> encode) 699 + let encoder_encoding e = e.encoding 700 + let encoder_dst e = e.dst 701 + 702 + (* Manual sources and destinations. *) 703 + 704 + module Manual = struct 705 + let src = src 706 + let dst = dst 707 + let dst_rem = o_rem 708 + end 709 + 710 + (* Strings folders and Buffer encoders *) 711 + 712 + module String = struct 713 + let encoding_guess s = 714 + let s = Bytes.unsafe_of_string s in 715 + match r_encoding s 0 (max (Bytes.length s) 3) with 716 + | `UTF_8 d -> `UTF_8, (d = `BOM) 717 + | `UTF_16BE d -> `UTF_16BE, (d = `BOM) 718 + | `UTF_16LE d -> `UTF_16LE, (d = `BOM) 719 + 720 + type 'a folder = 721 + 'a -> int -> [ `Uchar of Uchar.t | `Malformed of string ] -> 'a 722 + 723 + let fold_utf_8 ?(pos = 0) ?len f acc s = 724 + let rec loop acc f s i last = 725 + if i > last then acc else 726 + let need = unsafe_array_get utf_8_len (unsafe_byte s i) in 727 + if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last else 728 + let rem = last - i + 1 in 729 + if rem < need then f acc i (malformed s i rem) else 730 + loop (f acc i (r_utf_8 s i need)) f s (i + need) last 731 + in 732 + let len = match len with None -> String.length s - pos | Some l -> l in 733 + let last = pos + len - 1 in 734 + loop acc f (Bytes.unsafe_of_string s) pos last 735 + 736 + let fold_utf_16be ?(pos = 0) ?len f acc s = 737 + let rec loop acc f s i last = 738 + if i > last then acc else 739 + let rem = last - i + 1 in 740 + if rem < 2 then f acc i (malformed s i 1) else 741 + match r_utf_16 s i (i + 1) with 742 + | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) last 743 + | `Hi hi -> 744 + if rem < 4 then f acc i (malformed s i rem) else 745 + loop (f acc i (r_utf_16_lo hi s (i + 2) (i + 3))) f s (i + 4) last 746 + in 747 + let len = match len with None -> String.length s - pos | Some l -> l in 748 + let last = pos + len - 1 in 749 + loop acc f (Bytes.unsafe_of_string s) pos last 750 + 751 + let fold_utf_16le ?(pos = 0) ?len f acc s = 752 + (* [fold_utf_16be], bytes swapped. *) 753 + let rec loop acc f s i last = 754 + if i > last then acc else 755 + let rem = last - i + 1 in 756 + if rem < 2 then f acc i (malformed s i 1) else 757 + match r_utf_16 s (i + 1) i with 758 + | `Uchar _ | `Malformed _ as v -> loop (f acc i v) f s (i + 2) last 759 + | `Hi hi -> 760 + if rem < 4 then f acc i (malformed s i rem) else 761 + loop (f acc i (r_utf_16_lo hi s (i + 3) (i + 2))) f s (i + 4) last 762 + in 763 + let len = match len with None -> String.length s - pos | Some l -> l in 764 + let last = pos + len - 1 in 765 + loop acc f (Bytes.unsafe_of_string s) pos last 766 + end 767 + 768 + module Buffer = struct 769 + let add_utf_8 = Buffer.add_utf_8_uchar 770 + let add_utf_16be = Buffer.add_utf_16be_uchar 771 + let add_utf_16le = Buffer.add_utf_16le_uchar 772 + end

+494

vendor/opam/uutf/src/uutf.mli

··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2012 The uutf programmers. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** Non-blocking streaming Unicode codec. 7 + 8 + [Uutf] is a non-blocking streaming codec to {{:#decode}decode} and 9 + {{:#encode}encode} the {{:http://www.ietf.org/rfc/rfc3629.txt} 10 + UTF-8}, {{:http://www.ietf.org/rfc/rfc2781.txt} UTF-16}, UTF-16LE 11 + and UTF-16BE encoding schemes. It can efficiently work character by 12 + character without blocking on IO. Decoders perform 13 + character position tracking and support {{!nln}newline normalization}. 14 + 15 + Functions are also provided to {{!String} fold over} the characters 16 + of UTF encoded OCaml string values and to {{!Buffer}directly encode} 17 + characters in OCaml {!Stdlib.Buffer.t} values. {b Note} that since OCaml 18 + 4.14, that functionality can be found in {!Stdlib.String} and 19 + {!Stdlib.Buffer} and you are encouraged to migrate to it. 20 + 21 + See {{:#examples}examples} of use. 22 + 23 + {b References} 24 + {ul 25 + {- The Unicode Consortium. 26 + {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}. 27 + (latest version)}} 28 + *) 29 + 30 + (** {1:ucharcsts Special Unicode characters} *) 31 + 32 + val u_bom : Uchar.t 33 + (** [u_bom] is the {{:http://unicode.org/glossary/#byte_order_mark}byte 34 + order mark} (BOM) character ([U+FEFF]). From OCaml 4.06 on, use 35 + {!Uchar.bom}. *) 36 + 37 + val u_rep : Uchar.t 38 + (** [u_rep] is the 39 + {{:http://unicode.org/glossary/#replacement_character}replacement} 40 + character ([U+FFFD]). From OCaml 4.06 on, use 41 + {!Uchar.rep}. *) 42 + 43 + 44 + (** {1:schemes Unicode encoding schemes} *) 45 + 46 + type encoding = [ `UTF_16 | `UTF_16BE | `UTF_16LE | `UTF_8 ] 47 + (** The type for Unicode 48 + {{:http://unicode.org/glossary/#character_encoding_scheme}encoding 49 + schemes}. *) 50 + 51 + type decoder_encoding = [ encoding | `US_ASCII | `ISO_8859_1 ] 52 + (** The type for encoding schemes {e decoded} by [Uutf]. Unicode encoding 53 + schemes plus {{:http://tools.ietf.org/html/rfc20}US-ASCII} and 54 + {{:http://www.ecma-international.org/publications/standards/Ecma-094.htm} 55 + ISO/IEC 8859-1} (latin-1). *) 56 + 57 + val encoding_of_string : string -> decoder_encoding option 58 + (** [encoding_of_string s] converts a (case insensitive) 59 + {{:http://www.iana.org/assignments/character-sets}IANA character set name} 60 + to an encoding. *) 61 + 62 + val encoding_to_string : [< decoder_encoding] -> string 63 + (** [encoding_to_string e] is a 64 + {{:http://www.iana.org/assignments/character-sets}IANA character set name} 65 + for [e]. *) 66 + 67 + (** {1:decode Decode} *) 68 + 69 + type src = [ `Channel of in_channel | `String of string | `Manual ] 70 + (** The type for input sources. With a [`Manual] source the client 71 + must provide input with {!Manual.src}. *) 72 + 73 + type nln = [ `ASCII of Uchar.t | `NLF of Uchar.t | `Readline of Uchar.t ] 74 + (** The type for newline normalizations. The variant argument is the 75 + normalization character. 76 + {ul 77 + {- [`ASCII], normalizes CR ([U+000D]), LF ([U+000A]) and CRLF 78 + (<[U+000D], [U+000A]>).} 79 + {- [`NLF], normalizes the Unicode newline function (NLF). This is 80 + NEL ([U+0085]) and the normalizations of [`ASCII].} 81 + {- [`Readline], normalizes for a Unicode readline function. This is FF 82 + ([U+000C]), LS ([U+2028]), PS ([U+2029]), and the normalizations 83 + of [`NLF].}} 84 + Used with an appropriate normalization character the [`NLF] and 85 + [`Readline] normalizations allow to implement all the different 86 + recommendations of Unicode's newline guidelines (section 5.8 in 87 + Unicode 9.0.0). *) 88 + 89 + type decoder 90 + (** The type for decoders. *) 91 + 92 + val decoder : ?nln:[< nln] -> ?encoding:[< decoder_encoding] -> [< src] -> 93 + decoder 94 + (** [decoder nln encoding src] is a decoder that inputs from [src]. 95 + 96 + {b Byte order mark.} 97 + {{:http://unicode.org/glossary/#byte_order_mark}Byte order mark} 98 + (BOM) constraints are application dependent and prone to 99 + misunderstandings (see the 100 + {{:http://www.unicode.org/faq/utf_bom.html#BOM}FAQ}). Hence, 101 + [Uutf] decoders have a simple rule: an {e initial BOM is always 102 + removed from the input and not counted in character position 103 + tracking}. The function {!decoder_removed_bom} does however return 104 + [true] if a BOM was removed so that all the information can be 105 + recovered if needed. 106 + 107 + For UTF-16BE and UTF-16LE the above rule is a violation of 108 + conformance D96 and D97 of the standard. [Uutf] favors the idea 109 + that if there's a BOM, decoding with [`UTF_16] or the [`UTF_16XX] 110 + corresponding to the BOM should decode the same character sequence 111 + (this is not the case if you stick to the standard). The client 112 + can however regain conformance by consulting the result of 113 + {!decoder_removed_bom} and take appropriate action. 114 + 115 + {b Encoding.} [encoding] specifies the decoded encoding 116 + scheme. If [`UTF_16] is used the endianness is determined 117 + according to the standard: from a 118 + {{:http://unicode.org/glossary/#byte_order_mark}BOM} 119 + if there is one, [`UTF_16BE] otherwise. 120 + 121 + If [encoding] is unspecified it is guessed. The result of a guess 122 + can only be [`UTF_8], [`UTF_16BE] or [`UTF_16LE]. The heuristic 123 + looks at the first three bytes of input (or less if impossible) 124 + and takes the {e first} matching byte pattern in the table below. 125 + {v 126 + xx = any byte 127 + .. = any byte or no byte (input too small) 128 + pp = positive byte 129 + uu = valid UTF-8 first byte 130 + 131 + Bytes | Guess | Rationale 132 + ---------+-----------+----------------------------------------------- 133 + EF BB BF | `UTF_8 | UTF-8 BOM 134 + FE FF .. | `UTF_16BE | UTF-16BE BOM 135 + FF FE .. | `UTF_16LE | UTF-16LE BOM 136 + 00 pp .. | `UTF_16BE | ASCII UTF-16BE and U+0000 is often forbidden 137 + pp 00 .. | `UTF_16LE | ASCII UTF-16LE and U+0000 is often forbidden 138 + uu .. .. | `UTF_8 | ASCII UTF-8 or valid UTF-8 first byte. 139 + xx xx .. | `UTF_16BE | Not UTF-8 => UTF-16, no BOM => UTF-16BE 140 + .. .. .. | `UTF_8 | Single malformed UTF-8 byte or no input. 141 + v} 142 + This heuristic is compatible both with BOM based 143 + recognitition and 144 + {{:http://tools.ietf.org/html/rfc4627#section-3}JSON-like encoding 145 + recognition} that relies on ASCII being present at the beginning 146 + of the stream. Also, {!decoder_removed_bom} will tell the client 147 + if the guess was BOM based. 148 + 149 + {b Newline normalization.} If [nln] is specified, the given 150 + newline normalization is performed, see {!nln}. Otherwise 151 + all newlines are returned as found in the input. 152 + 153 + {b Character position.} The line number, column number, byte count 154 + and character count of the last decoded character (including 155 + [`Malformed] ones) are respectively returned by {!decoder_line}, 156 + {!decoder_col}, {!decoder_byte_count} and {!decoder_count}. Before 157 + the first call to {!val-decode} the line number is [1] and the column 158 + is [0]. Each {!val-decode} returning [`Uchar] or [`Malformed] 159 + increments the column until a newline. On a newline, the line 160 + number is incremented and the column set to zero. For example the 161 + line is [2] and column [0] after the first newline was 162 + decoded. This can be understood as if {!val-decode} was moving an 163 + insertion point to the right in the data. A {e newline} is 164 + anything normalized by [`Readline], see {!nln}. 165 + 166 + [Uutf] assumes that each Unicode scalar value has a column width 167 + of 1. The same assumption may not be made by the display program 168 + (e.g. for [emacs]' compilation mode you need to set 169 + [compilation-error-screen-columns] to [nil]). The problem is in 170 + general difficult to solve without interaction or convention with the 171 + display program's rendering engine. Depending on the context better column 172 + increments can be implemented by using {!Uucp.Break.tty_width_hint} or 173 + {{:http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries} 174 + grapheme cluster boundaries} (see {!Uuseg}). *) 175 + 176 + val decode : decoder -> 177 + [ `Await | `Uchar of Uchar.t | `End | `Malformed of string] 178 + (** [decode d] is: 179 + {ul 180 + {- [`Await] if [d] has a [`Manual] input source and awaits 181 + for more input. The client must use {!Manual.src} to provide it.} 182 + {- [`Uchar u] if a Unicode scalar value [u] was decoded.} 183 + {- [`End] if the end of input was reached.} 184 + {- [`Malformed bytes] if the [bytes] sequence is malformed according to 185 + the decoded encoding scheme. If you are interested in a best-effort 186 + decoding you can still continue to decode after an error until the 187 + decoder synchronizes again on valid bytes. It may however be a good 188 + idea to signal the malformed characters by adding an {!u_rep} 189 + character to the parsed data, see the {{:#examples}examples}.}} 190 + 191 + {b Note.} Repeated invocation always eventually returns [`End], even 192 + in case of errors. *) 193 + 194 + val decoder_encoding : decoder -> decoder_encoding 195 + (** [decoder_encoding d] is [d]'s the decoded encoding scheme of [d]. 196 + 197 + {b Warning.} If the decoder guesses the encoding or uses [`UTF_16], 198 + rely on this value only after the first [`Uchar] was decoded. *) 199 + 200 + (**/**) 201 + 202 + (* This function is dangerous, it may destroy the current continuation. 203 + But it's needed for things like XML parsers. *) 204 + 205 + val set_decoder_encoding : decoder -> [< decoder_encoding] -> unit 206 + (** [set_decoder_encoding d enc] changes the decoded encoding 207 + to [enc] after decoding started. 208 + 209 + {b Warning.} Call only after {!val-decode} was called on [d] and that the 210 + last call to it returned something different from [`Await] or data may 211 + be lost. After encoding guess wait for at least three [`Uchar]s. *) 212 + 213 + (**/**) 214 + 215 + val decoder_line : decoder -> int 216 + (** [decoder_line d] is the line number of the last 217 + decoded (or malformed) character. See {!val-decoder} for details. *) 218 + 219 + val decoder_col : decoder -> int 220 + (** [decoder_col d] is the column number of the last decoded 221 + (or malformed) character. See {!val-decoder} for details. *) 222 + 223 + val decoder_byte_count : decoder -> int 224 + (** [decoder_byte_count d] is the number of bytes already decoded on 225 + [d] (including malformed ones). This is the last {!val-decode}'s 226 + end byte offset counting from the beginning of the stream. *) 227 + 228 + val decoder_count : decoder -> int 229 + (** [decoder_count d] is the number of characters already decoded on [d] 230 + (including malformed ones). See {!val-decoder} for details. *) 231 + 232 + val decoder_removed_bom : decoder -> bool 233 + (** [decoder_removed_bom d] is [true] iff an {e initial} 234 + {{:http://unicode.org/glossary/#byte_order_mark}BOM} was 235 + removed from the input stream. See {!val-decoder} for details. *) 236 + 237 + val decoder_src : decoder -> src 238 + (** [decoder_src d] is [d]'s input source. *) 239 + 240 + val decoder_nln : decoder -> nln option 241 + (** [decoder_nln d] returns [d]'s newline normalization (if any). *) 242 + 243 + val pp_decode : Format.formatter -> 244 + [< `Await | `Uchar of Uchar.t | `End | `Malformed of string] -> unit 245 + (** [pp_decode ppf v] prints an unspecified representation of [v] on 246 + [ppf]. *) 247 + 248 + (** {1:encode Encode} *) 249 + 250 + type dst = [ `Channel of out_channel | `Buffer of Buffer.t | `Manual ] 251 + (** The type for output destinations. With a [`Manual] destination the client 252 + must provide output storage with {!Manual.dst}. *) 253 + 254 + type encoder 255 + (** The type for Unicode encoders. *) 256 + 257 + val encoder : [< encoding] -> [< dst] -> encoder 258 + (** [encoder encoding dst] is an encoder for [encoding] that outputs 259 + to [dst]. 260 + 261 + {b Note.} No initial 262 + {{:http://unicode.org/glossary/#byte_order_mark}BOM} 263 + is encoded. If needed, this duty is left to the client. *) 264 + 265 + val encode : 266 + encoder -> [<`Await | `End | `Uchar of Uchar.t ] -> [`Ok | `Partial ] 267 + (** [encode e v] is : 268 + {ul 269 + {- [`Partial] iff [e] has a [`Manual] destination and needs more output 270 + storage. The client must use {!Manual.dst} to provide a new buffer 271 + and then call {!val-encode} with [`Await] until [`Ok] is returned.} 272 + {- [`Ok] when the encoder is ready to encode a new [`Uchar] or [`End]}} 273 + 274 + For [`Manual] destination, encoding [`End] always returns 275 + [`Partial], the client should continue as usual with [`Await] 276 + until [`Ok] is returned at which point {!Manual.dst_rem} [e] is 277 + guaranteed to be the size of the last provided buffer (i.e. nothing 278 + was written). 279 + 280 + {b Raises.} [Invalid_argument] if an [`Uchar] or [`End] is encoded 281 + after a [`Partial] encode. *) 282 + 283 + val encoder_encoding : encoder -> encoding 284 + (** [encoder_encoding e] is [e]'s encoding. *) 285 + 286 + val encoder_dst : encoder -> dst 287 + (** [encoder_dst e] is [e]'s output destination. *) 288 + 289 + (** {1:manual Manual sources and destinations.} *) 290 + 291 + (** Manual sources and destinations. 292 + 293 + {b Warning.} Use only with [`Manual] decoder and encoders. *) 294 + module Manual : sig 295 + val src : decoder -> Bytes.t -> int -> int -> unit 296 + (** [src d s j l] provides [d] with [l] bytes to read, starting at 297 + [j] in [s]. This byte range is read by calls to {!val-decode} with [d] 298 + until [`Await] is returned. To signal the end of input call the function 299 + with [l = 0]. *) 300 + 301 + val dst : encoder -> Bytes.t -> int -> int -> unit 302 + (** [dst e s j l] provides [e] with [l] bytes to write, starting 303 + at [j] in [s]. This byte range is written by calls to 304 + {!val-encode} with [e] until [`Partial] is returned. Use {!dst_rem} to 305 + know the remaining number of non-written free bytes in [s]. *) 306 + 307 + val dst_rem : encoder -> int 308 + (** [dst_rem e] is the remaining number of non-written, free bytes 309 + in the last buffer provided with {!Manual.dst}. *) 310 + end 311 + 312 + (** {1:strbuf String folders and Buffer encoders} *) 313 + 314 + (** Fold over the characters of UTF encoded OCaml [string] values. 315 + 316 + {b Note.} Since OCaml 4.14, UTF decoders are available in 317 + {!Stdlib.String}. You are encouraged to migrate to them. *) 318 + module String : sig 319 + 320 + (** {1 Encoding guess} *) 321 + 322 + val encoding_guess : string -> [ `UTF_8 | `UTF_16BE | `UTF_16LE ] * bool 323 + (** [encoding_guess s] is the encoding guessed for [s] coupled with 324 + [true] iff there's an initial 325 + {{:http://unicode.org/glossary/#byte_order_mark}BOM}. *) 326 + 327 + (** {1 String folders} 328 + 329 + {b Note.} Initial {{:http://unicode.org/glossary/#byte_order_mark}BOM}s 330 + are also folded over. *) 331 + 332 + type 'a folder = 'a -> int -> [ `Uchar of Uchar.t | `Malformed of string ] -> 333 + 'a 334 + (** The type for character folders. The integer is the index in the 335 + string where the [`Uchar] or [`Malformed] starts. *) 336 + 337 + val fold_utf_8 : ?pos:int -> ?len:int -> 'a folder -> 'a -> string -> 'a 338 + (** [fold_utf_8 f a s ?pos ?len ()] is 339 + [f (] ... [(f (f a pos u]{_0}[) j]{_1}[ u]{_1}[)] ... [)] ... [) 340 + j]{_n}[ u]{_n} 341 + where [u]{_i}, [j]{_i} are characters and their start position 342 + in the UTF-8 encoded substring [s] starting at [pos] and [len] 343 + long. The default value for [pos] is [0] and [len] is 344 + [String.length s - pos]. *) 345 + 346 + val fold_utf_16be : ?pos:int -> ?len:int -> 'a folder -> 'a -> string -> 'a 347 + (** [fold_utf_16be f a s ?pos ?len ()] is 348 + [f (] ... [(f (f a pos u]{_0}[) j]{_1}[ u]{_1}[)] ... [)] ... [) 349 + j]{_n}[ u]{_n} 350 + where [u]{_i}, [j]{_i} are characters and their start position 351 + in the UTF-8 encoded substring [s] starting at [pos] and [len] 352 + long. The default value for [pos] is [0] and [len] is 353 + [String.length s - pos]. *) 354 + 355 + val fold_utf_16le : ?pos:int -> ?len:int -> 'a folder -> 'a -> string -> 'a 356 + (** [fold_utf_16le f a s ?pos ?len ()] is 357 + [f (] ... [(f (f a pos u]{_0}[) j]{_1}[ u]{_1}[)] ... [)] ... [) 358 + j]{_n}[ u]{_n} 359 + where [u]{_i}, [j]{_i} are characters and their start position 360 + in the UTF-8 encoded substring [s] starting at [pos] and [len] 361 + long. The default value for [pos] is [0] and [len] is 362 + [String.length s - pos]. *) 363 + end 364 + 365 + (** UTF encode characters in OCaml {!Buffer.t} values. 366 + 367 + {b Note.} Since OCaml 4.06, these encoders are available in 368 + {!Stdlib.Buffer}. You are encouraged to migrate to them. *) 369 + module Buffer : sig 370 + 371 + (** {1 Buffer encoders} *) 372 + 373 + val add_utf_8 : Buffer.t -> Uchar.t -> unit 374 + (** [add_utf_8 b u] adds the UTF-8 encoding of [u] to [b]. *) 375 + 376 + val add_utf_16be : Buffer.t -> Uchar.t -> unit 377 + (** [add_utf_16be b u] adds the UTF-16BE encoding of [u] to [b]. *) 378 + 379 + val add_utf_16le : Buffer.t -> Uchar.t -> unit 380 + (** [add_utf_16le b u] adds the UTF-16LE encoding of [u] to [b]. *) 381 + end 382 + 383 + (** {1:examples Examples} 384 + 385 + {2:readlines Read lines} 386 + 387 + The value of [lines src] is the list of lines in [src] as UTF-8 388 + encoded OCaml strings. Line breaks are determined according to the 389 + recommendation R4 for a [readline] function in section 5.8 of 390 + Unicode 9.0.0. If a decoding error occurs we silently replace the 391 + malformed sequence by the replacement character {!u_rep} and continue. 392 + {[let lines ?encoding (src : [`Channel of in_channel | `String of string]) = 393 + let rec loop d buf acc = match Uutf.decode d with 394 + | `Uchar u -> 395 + begin match Uchar.to_int u with 396 + | 0x000A -> 397 + let line = Buffer.contents buf in 398 + Buffer.clear buf; loop d buf (line :: acc) 399 + | _ -> 400 + Uutf.Buffer.add_utf_8 buf u; loop d buf acc 401 + end 402 + | `End -> List.rev (Buffer.contents buf :: acc) 403 + | `Malformed _ -> Uutf.Buffer.add_utf_8 buf Uutf.u_rep; loop d buf acc 404 + | `Await -> assert false 405 + in 406 + let nln = `Readline (Uchar.of_int 0x000A) in 407 + loop (Uutf.decoder ~nln ?encoding src) (Buffer.create 512) [] 408 + ]} 409 + Using the [`Manual] interface, [lines_fd] does the same but on a Unix file 410 + descriptor. 411 + {[let lines_fd ?encoding (fd : Unix.file_descr) = 412 + let rec loop fd s d buf acc = match Uutf.decode d with 413 + | `Uchar u -> 414 + begin match Uchar.to_int u with 415 + | 0x000A -> 416 + let line = Buffer.contents buf in 417 + Buffer.clear buf; loop fd s d buf (line :: acc) 418 + | _ -> 419 + Uutf.Buffer.add_utf_8 buf u; loop fd s d buf acc 420 + end 421 + | `End -> List.rev (Buffer.contents buf :: acc) 422 + | `Malformed _ -> Uutf.Buffer.add_utf_8 buf Uutf.u_rep; loop fd s d buf acc 423 + | `Await -> 424 + let rec unix_read fd s j l = try Unix.read fd s j l with 425 + | Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l 426 + in 427 + let rc = unix_read fd s 0 (Bytes.length s) in 428 + Uutf.Manual.src d s 0 rc; loop fd s d buf acc 429 + in 430 + let s = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in 431 + let nln = `Readline (Uchar.of_int 0x000A) in 432 + loop fd s (Uutf.decoder ~nln ?encoding `Manual) (Buffer.create 512) [] 433 + ]} 434 + 435 + {2:recode Recode} 436 + 437 + The result of [recode src out_encoding dst] has the characters of 438 + [src] written on [dst] with encoding [out_encoding]. If a 439 + decoding error occurs we silently replace the malformed sequence 440 + by the replacement character {!u_rep} and continue. Note that we 441 + don't add an initial 442 + {{:http://unicode.org/glossary/#byte_order_mark}BOM} to [dst], 443 + recoding will thus loose the initial BOM [src] may have. Whether 444 + this is a problem or not depends on the context. 445 + {[let recode ?nln ?encoding out_encoding 446 + (src : [`Channel of in_channel | `String of string]) 447 + (dst : [`Channel of out_channel | `Buffer of Buffer.t]) 448 + = 449 + let rec loop d e = match Uutf.decode d with 450 + | `Uchar _ as u -> ignore (Uutf.encode e u); loop d e 451 + | `End -> ignore (Uutf.encode e `End) 452 + | `Malformed _ -> ignore (Uutf.encode e (`Uchar Uutf.u_rep)); loop d e 453 + | `Await -> assert false 454 + in 455 + let d = Uutf.decoder ?nln ?encoding src in 456 + let e = Uutf.encoder out_encoding dst in 457 + loop d e]} 458 + Using the [`Manual] interface, [recode_fd] does the same but between 459 + Unix file descriptors. 460 + {[let recode_fd ?nln ?encoding out_encoding 461 + (fdi : Unix.file_descr) 462 + (fdo : Unix.file_descr) 463 + = 464 + let rec encode fd s e v = match Uutf.encode e v with `Ok -> () 465 + | `Partial -> 466 + let rec unix_write fd s j l = 467 + let rec write fd s j l = try Unix.single_write fd s j l with 468 + | Unix.Unix_error (Unix.EINTR, _, _) -> write fd s j l 469 + in 470 + let wc = write fd s j l in 471 + if wc < l then unix_write fd s (j + wc) (l - wc) else () 472 + in 473 + unix_write fd s 0 (Bytes.length s - Uutf.Manual.dst_rem e); 474 + Uutf.Manual.dst e s 0 (Bytes.length s); 475 + encode fd s e `Await 476 + in 477 + let rec loop fdi fdo ds es d e = match Uutf.decode d with 478 + | `Uchar _ as u -> encode fdo es e u; loop fdi fdo ds es d e 479 + | `End -> encode fdo es e `End 480 + | `Malformed _ -> encode fdo es e (`Uchar Uutf.u_rep); loop fdi fdo ds es d e 481 + | `Await -> 482 + let rec unix_read fd s j l = try Unix.read fd s j l with 483 + | Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l 484 + in 485 + let rc = unix_read fdi ds 0 (Bytes.length ds) in 486 + Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e 487 + in 488 + let ds = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in 489 + let es = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in 490 + let d = Uutf.decoder ?nln ?encoding `Manual in 491 + let e = Uutf.encoder out_encoding `Manual in 492 + Uutf.Manual.dst e es 0 (Bytes.length es); 493 + loop fdi fdo ds es d e]} 494 + *)

+1

vendor/opam/uutf/src/uutf.mllib

··· 1 + Uutf

+94

vendor/opam/uutf/test/examples.ml

··· 1 + (* Examples from the documentation, this code is in public domain. *) 2 + 3 + (* Read lines *) 4 + 5 + let lines ?encoding (src : [`Channel of in_channel | `String of string]) = 6 + let rec loop d buf acc = match Uutf.decode d with 7 + | `Uchar u -> 8 + begin match Uchar.to_int u with 9 + | 0x000A -> 10 + let line = Buffer.contents buf in 11 + Buffer.clear buf; loop d buf (line :: acc) 12 + | _ -> 13 + Uutf.Buffer.add_utf_8 buf u; loop d buf acc 14 + end 15 + | `End -> List.rev (Buffer.contents buf :: acc) 16 + | `Malformed _ -> Uutf.Buffer.add_utf_8 buf Uutf.u_rep; loop d buf acc 17 + | `Await -> assert false 18 + in 19 + let nln = `Readline (Uchar.of_int 0x000A) in 20 + loop (Uutf.decoder ~nln ?encoding src) (Buffer.create 512) [] 21 + 22 + let lines_fd ?encoding (fd : Unix.file_descr) = 23 + let rec loop fd s d buf acc = match Uutf.decode d with 24 + | `Uchar u -> 25 + begin match Uchar.to_int u with 26 + | 0x000A -> 27 + let line = Buffer.contents buf in 28 + Buffer.clear buf; loop fd s d buf (line :: acc) 29 + | _ -> 30 + Uutf.Buffer.add_utf_8 buf u; loop fd s d buf acc 31 + end 32 + | `End -> List.rev (Buffer.contents buf :: acc) 33 + | `Malformed _ -> Uutf.Buffer.add_utf_8 buf Uutf.u_rep; loop fd s d buf acc 34 + | `Await -> 35 + let rec unix_read fd s j l = try Unix.read fd s j l with 36 + | Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l 37 + in 38 + let rc = unix_read fd s 0 (Bytes.length s) in 39 + Uutf.Manual.src d s 0 rc; loop fd s d buf acc 40 + in 41 + let s = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in 42 + let nln = `Readline (Uchar.of_int 0x000A) in 43 + loop fd s (Uutf.decoder ~nln ?encoding `Manual) (Buffer.create 512) [] 44 + 45 + (* Recode *) 46 + 47 + let recode ?nln ?encoding out_encoding 48 + (src : [`Channel of in_channel | `String of string]) 49 + (dst : [`Channel of out_channel | `Buffer of Buffer.t]) 50 + = 51 + let rec loop d e = match Uutf.decode d with 52 + | `Uchar _ as u -> ignore (Uutf.encode e u); loop d e 53 + | `End -> ignore (Uutf.encode e `End) 54 + | `Malformed _ -> ignore (Uutf.encode e (`Uchar Uutf.u_rep)); loop d e 55 + | `Await -> assert false 56 + in 57 + let d = Uutf.decoder ?nln ?encoding src in 58 + let e = Uutf.encoder out_encoding dst in 59 + loop d e 60 + 61 + let recode_fd ?nln ?encoding out_encoding 62 + (fdi : Unix.file_descr) 63 + (fdo : Unix.file_descr) 64 + = 65 + let rec encode fd s e v = match Uutf.encode e v with `Ok -> () 66 + | `Partial -> 67 + let rec unix_write fd s j l = 68 + let rec write fd s j l = try Unix.single_write fd s j l with 69 + | Unix.Unix_error (Unix.EINTR, _, _) -> write fd s j l 70 + in 71 + let wc = write fd s j l in 72 + if wc < l then unix_write fd s (j + wc) (l - wc) else () 73 + in 74 + unix_write fd s 0 (Bytes.length s - Uutf.Manual.dst_rem e); 75 + Uutf.Manual.dst e s 0 (Bytes.length s); 76 + encode fd s e `Await 77 + in 78 + let rec loop fdi fdo ds es d e = match Uutf.decode d with 79 + | `Uchar _ as u -> encode fdo es e u; loop fdi fdo ds es d e 80 + | `End -> encode fdo es e `End 81 + | `Malformed _ -> encode fdo es e (`Uchar Uutf.u_rep); loop fdi fdo ds es d e 82 + | `Await -> 83 + let rec unix_read fd s j l = try Unix.read fd s j l with 84 + | Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l 85 + in 86 + let rc = unix_read fdi ds 0 (Bytes.length ds) in 87 + Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e 88 + in 89 + let ds = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in 90 + let es = Bytes.create 65536 (* UNIX_BUFFER_SIZE in 4.0.0 *) in 91 + let d = Uutf.decoder ?nln ?encoding `Manual in 92 + let e = Uutf.encoder out_encoding `Manual in 93 + Uutf.Manual.dst e es 0 (Bytes.length es); 94 + loop fdi fdo ds es d e

+376

vendor/opam/uutf/test/test_uutf.ml

··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2012 The uutf programmers. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + let u_nl = Uchar.of_int 0x000A 7 + let log f = Format.printf (f ^^ "@?") 8 + let fail fmt = 9 + let fail _ = failwith (Format.flush_str_formatter ()) in 10 + Format.kfprintf fail Format.str_formatter fmt 11 + 12 + let fail_decode e f = 13 + fail "expected %a, decoded %a" Uutf.pp_decode e Uutf.pp_decode f 14 + 15 + let uchar_succ u = if Uchar.equal u Uchar.max then u else Uchar.succ u 16 + let iter_uchars f = 17 + for u = 0x0000 to 0xD7FF do f (Uchar.unsafe_of_int u) done; 18 + for u = 0xE000 to 0x10FFFF do f (Uchar.unsafe_of_int u) done 19 + 20 + let codec_test () = 21 + let codec_uchars encoding s bsize = 22 + log "Codec every unicode scalar value in %s with buffer size %d.\n%!" 23 + (Uutf.encoding_to_string encoding) bsize; 24 + let encode_uchars encoding s bsize = 25 + let spos = ref 0 in 26 + let e = Uutf.encoder encoding `Manual in 27 + let rec encode e v = match Uutf.encode e v with `Ok -> () 28 + | `Partial -> 29 + let brem = Bytes.length s - !spos in 30 + let drem = Uutf.Manual.dst_rem e in 31 + let bsize = min bsize brem in 32 + Uutf.Manual.dst e s !spos bsize; 33 + spos := !spos + bsize - drem; 34 + encode e `Await 35 + in 36 + let encode_u u = encode e (`Uchar u) in 37 + iter_uchars encode_u; encode e `End; 38 + !spos - Uutf.Manual.dst_rem e (* encoded length. *) 39 + in 40 + let decode_uchars encoding s slen bsize = 41 + let spos = ref 0 in 42 + let bsize = min bsize slen in 43 + let d = Uutf.decoder ~encoding `Manual in 44 + let rec decode d = match Uutf.decode d with 45 + | `Malformed _ | `Uchar _ | `End as v -> v 46 + | `Await -> 47 + let rem = slen - !spos in 48 + let bsize = min bsize rem in 49 + Uutf.Manual.src d s !spos bsize; 50 + spos := !spos + bsize; 51 + decode d 52 + in 53 + let decode_u u = match decode d with 54 + | `Uchar u' when u = u' -> () 55 + | v -> fail_decode (`Uchar u) v 56 + in 57 + iter_uchars decode_u; 58 + match decode d with 59 + | `End -> () | v -> fail_decode `End v 60 + in 61 + let slen = encode_uchars encoding s bsize in 62 + decode_uchars encoding s slen bsize 63 + in 64 + let full = 4 * 0x10FFFF in (* will hold everything in any encoding. *) 65 + let s = Bytes.create full in 66 + let test encoding = 67 + (* Test with various sizes to increase condition coverage. *) 68 + for i = 1 to 11 do codec_uchars encoding s i done; 69 + codec_uchars encoding s full; 70 + in 71 + test `UTF_8; test `UTF_16BE; test `UTF_16LE 72 + 73 + let buffer_string_codec_test () = 74 + let codec_uchars encoding encode decode b = 75 + log "Buffer/String codec every unicode scalar value in %s.\n%!" 76 + (Uutf.encoding_to_string encoding); 77 + Buffer.clear b; 78 + iter_uchars (encode b); 79 + let s = Buffer.contents b in 80 + let check uchar _ = function 81 + | `Uchar u when Uchar.equal u uchar -> uchar_succ uchar 82 + | v -> fail_decode (`Uchar uchar) v 83 + in 84 + ignore (decode ?pos:None ?len:None check (Uchar.of_int 0x0000) s) 85 + in 86 + let b = Buffer.create (4 * 0x10FFFF) in 87 + codec_uchars `UTF_8 Uutf.Buffer.add_utf_8 Uutf.String.fold_utf_8 b; 88 + codec_uchars `UTF_16BE Uutf.Buffer.add_utf_16be Uutf.String.fold_utf_16be b; 89 + codec_uchars `UTF_16LE Uutf.Buffer.add_utf_16le Uutf.String.fold_utf_16le b 90 + 91 + let pos_test () = 92 + let test encoding s = 93 + log "Test position tracking in %s.\n%!" (Uutf.encoding_to_string encoding); 94 + let pos d (l, c, k) = 95 + match Uutf.decoder_line d, Uutf.decoder_col d, Uutf.decoder_count d with 96 + | (l', c', k') when l = l' && c = c' && k = k' -> ignore (Uutf.decode d) 97 + | (l', c', k') -> 98 + fail "Expected position (%d,%d,%d) found (%d,%d,%d)." l c k l' c' k' 99 + in 100 + let e = Uutf.decoder ~encoding (`String s) in 101 + pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3); 102 + pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 0, 6); pos e (3, 1, 7); 103 + pos e (3, 2, 8); pos e (4, 0, 9); pos e (4, 0, 10); pos e (5, 0, 11); 104 + pos e (6, 0, 12); pos e (6, 0, 12); pos e (6, 0, 12); 105 + let e = Uutf.decoder ~nln:(`ASCII u_nl) ~encoding (`String s) in 106 + pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3); 107 + pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 1, 6); pos e (3, 2, 7); 108 + pos e (4, 0, 8); pos e (5, 0, 9); pos e (6, 0, 10); pos e (6, 0, 10); 109 + pos e (6, 0, 10); 110 + let e = Uutf.decoder ~nln:(`NLF u_nl) ~encoding (`String s) in 111 + pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3); 112 + pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 1, 6); pos e (3, 2, 7); 113 + pos e (4, 0, 8); pos e (5, 0, 9); pos e (6, 0, 10); pos e (6, 0, 10); 114 + pos e (6, 0, 10); 115 + let e = Uutf.decoder ~nln:(`Readline u_nl) ~encoding (`String s) in 116 + pos e (1, 0, 0); pos e (1, 1, 1); pos e (1, 2, 2); pos e (2, 0, 3); 117 + pos e (2, 1, 4); pos e (3, 0, 5); pos e (3, 1, 6); pos e (3, 2, 7); 118 + pos e (4, 0, 8); pos e (5, 0, 9); pos e (6, 0, 10); pos e (6, 0, 10); 119 + pos e (6, 0, 10); 120 + in 121 + test `UTF_8 "LL\nL\r\nLL\r\n\n\x0C"; 122 + test `UTF_16BE 123 + "\x00\x4C\x00\x4C\x00\x0A\x00\x4C\x00\x0D\x00\x0A\x00\x4C\x00\x4C\ 124 + \x00\x0D\x00\x0A\x00\x0A\x00\x0C"; 125 + test `UTF_16LE 126 + "\x4C\x00\x4C\x00\x0A\x00\x4C\x00\x0D\x00\x0A\x00\x4C\x00\x4C\x00\ 127 + \x0D\x00\x0A\x00\x0A\x00\x0C\x00"; 128 + () 129 + 130 + let guess_test () = 131 + log "Test encoding guessing.\n%!"; 132 + let test (s, enc, removed_bom, seq) = 133 + let d = Uutf.decoder (`String s) in 134 + let rec test_seq seq d = match seq, Uutf.decode d with 135 + | `Uchar u :: vs, `Uchar u' when Uchar.equal u u' -> test_seq vs d 136 + | `Malformed bs :: vs, `Malformed bs' when bs = bs' -> test_seq vs d 137 + | [], `End -> () 138 + | v :: _, v' -> fail_decode v v' 139 + | _ , _ -> assert false 140 + in 141 + test_seq seq d; 142 + let guess = Uutf.decoder_encoding d in 143 + if guess <> enc then fail "expected encoding: %s guessed: %s" 144 + (Uutf.encoding_to_string enc) (Uutf.encoding_to_string guess); 145 + let rem_bom = Uutf.decoder_removed_bom d in 146 + if rem_bom <> removed_bom then 147 + fail "expected removed bom: %b found: %b" removed_bom rem_bom 148 + in 149 + let uchar u = `Uchar (Uchar.unsafe_of_int u) in 150 + (* UTF-8 guess *) 151 + test ("", `UTF_8, false, []); 152 + test ("\xEF", `UTF_8, false, [`Malformed "\xEF";]); 153 + test ("\xEF\xBB", `UTF_8, false, [`Malformed "\xEF\xBB";]); 154 + test ("\xEF\xBB\x00", `UTF_8, false, [`Malformed "\xEF\xBB\x00";]); 155 + test ("\xEF\xBB\xBF\xEF\xBB\xBF", `UTF_8, true, [`Uchar Uutf.u_bom;]); 156 + test ("\n\r\n", `UTF_8, false, [`Uchar u_nl; uchar 0x0D; `Uchar u_nl;]); 157 + test ("\n\x80\xEF\xBB\xBF\n", `UTF_8, false, 158 + [`Uchar u_nl; `Malformed "\x80"; `Uchar Uutf.u_bom; `Uchar u_nl]); 159 + test ("\n\n\xEF\xBB\x00\n", `UTF_8, false, 160 + [`Uchar u_nl; `Uchar u_nl; `Malformed "\xEF\xBB\x00"; `Uchar u_nl;]); 161 + test ("\n\xC8\x99", `UTF_8, false, [`Uchar u_nl; uchar 0x0219;]); 162 + test ("\xC8\x99\n", `UTF_8, false, [uchar 0x0219; `Uchar u_nl;]); 163 + test ("\xC8\x99\n\n", `UTF_8, false, 164 + [uchar 0x0219; `Uchar u_nl; `Uchar u_nl]); 165 + test ("\xC8\x99\xC8\x99", `UTF_8, false, [uchar 0x0219; uchar 0x0219]); 166 + test ("\xC8\x99\xF0\x9F\x90\xAB", `UTF_8, false, 167 + [uchar 0x0219; uchar 0x1F42B]); 168 + test ("\xF0\x9F\x90\xAB\n", `UTF_8, false, [uchar 0x1F42B; `Uchar u_nl ]); 169 + (* UTF-16BE guess *) 170 + test ("\xFE\xFF\xDB\xFF\xDF\xFF\x00\x0A", `UTF_16BE, true, 171 + [uchar 0x10FFFF; `Uchar u_nl;]); 172 + test ("\xFE\xFF\xDB\xFF\x00\x0A\x00\x0A", `UTF_16BE, true, 173 + [`Malformed "\xDB\xFF\x00\x0A"; `Uchar u_nl;]); 174 + test ("\xFE\xFF\xDB\xFF\xDF", `UTF_16BE, true, 175 + [`Malformed "\xDB\xFF\xDF";]); 176 + test ("\x80\x81\xDB\xFF\xDF\xFF\xFE\xFF\xDF\xFF\xDB\xFF", `UTF_16BE, false, 177 + [uchar 0x8081; uchar 0x10FFFF; `Uchar Uutf.u_bom; 178 + `Malformed "\xDF\xFF"; `Malformed "\xDB\xFF"]); 179 + test ("\x80\x81\xDF\xFF\xDB\xFF\xFE", `UTF_16BE, false, 180 + [uchar 0x8081; `Malformed "\xDF\xFF"; `Malformed "\xDB\xFF\xFE";]); 181 + test ("\x00\x0A", `UTF_16BE, false, [`Uchar u_nl]); 182 + test ("\x00\x0A\xDB", `UTF_16BE, false, [`Uchar u_nl; `Malformed "\xDB"]); 183 + test ("\x00\x0A\xDB\xFF", `UTF_16BE, false, 184 + [`Uchar u_nl; `Malformed "\xDB\xFF"]); 185 + test ("\x00\x0A\xDB\xFF\xDF", `UTF_16BE, false, 186 + [`Uchar u_nl; `Malformed "\xDB\xFF\xDF"]); 187 + test ("\x00\x0A\xDB\xFF\xDF\xFF", `UTF_16BE, false, 188 + [`Uchar u_nl; uchar 0x10FFFF]); 189 + test ("\x00\x0A\x00\x0A", `UTF_16BE, false, 190 + [`Uchar u_nl; `Uchar u_nl]); 191 + (* UTF-16LE guess *) 192 + test ("\xFF\xFE\xFF\xDB\xFF\xDF\x0A\x00", `UTF_16LE, true, 193 + [uchar 0x10FFFF; `Uchar u_nl;]); 194 + test ("\xFF\xFE\xFF\xDB\x0A\x00\x0A\x00", `UTF_16LE, true, 195 + [`Malformed "\xFF\xDB\x0A\x00"; `Uchar u_nl;]); 196 + test ("\xFF\xFE\xFF\xDB\xDF", `UTF_16LE, true, 197 + [`Malformed "\xFF\xDB\xDF";]); 198 + test ("\x0A\x00", `UTF_16LE, false, [`Uchar u_nl]); 199 + test ("\x0A\x00\xDB", `UTF_16LE, false, [`Uchar u_nl; `Malformed "\xDB"]); 200 + test ("\x0A\x00\xFF\xDB", `UTF_16LE, false, 201 + [`Uchar u_nl; `Malformed "\xFF\xDB"]); 202 + test ("\x0A\x00\xFF\xDB\xDF", `UTF_16LE, false, 203 + [`Uchar u_nl; `Malformed "\xFF\xDB\xDF"]); 204 + test ("\x0A\x00\xFF\xDB\xFF\xDF", `UTF_16LE, false, 205 + [`Uchar u_nl; uchar 0x10FFFF]); 206 + test ("\x0A\x00\x0A\x00", `UTF_16LE, false, 207 + [`Uchar u_nl; `Uchar u_nl]); 208 + () 209 + 210 + let test_sub () = 211 + log "Test Uutf.String.fold_utf_8 substring"; 212 + let trip fold ~pos ~len s = 213 + let b = Buffer.create 100 in 214 + let add _ _ = function 215 + | `Uchar u -> Uutf.Buffer.add_utf_8 b u 216 + | `Malformed _ -> assert false 217 + in 218 + fold ?pos:(Some pos) ?len:(Some len) add () s; 219 + assert (String.sub s pos len = Buffer.contents b); 220 + in 221 + trip Uutf.String.fold_utf_8 ~pos:4 ~len:4 "hop hap mop"; 222 + trip Uutf.String.fold_utf_8 ~pos:0 ~len:1 "hop hap mop"; 223 + trip Uutf.String.fold_utf_8 ~pos:2 ~len:1 "hop"; 224 + () 225 + 226 + module Int = struct type t = int let compare : int -> int -> int = compare end 227 + module Umap = Map.Make (Uchar) 228 + module Bmap = Map.Make (Bytes) 229 + 230 + (* Constructs from the specification, the map from uchars to their valid 231 + UTF-8 byte sequence and the map reverse map from valid UTF-8 byte sequences 232 + to their uchar. *) 233 + let utf8_maps () = 234 + log "Building UTF-8 codec maps from specification.\n"; 235 + let spec = [ (* UTF-8 byte sequences cf. table 3.7 p. 94 Unicode 6. *) 236 + (0x0000,0x007F), [|(0x00,0x7F)|]; 237 + (0x0080,0x07FF), [|(0xC2,0xDF); (0x80,0xBF)|]; 238 + (0x0800,0x0FFF), [|(0xE0,0xE0); (0xA0,0xBF); (0x80,0xBF)|]; 239 + (0x1000,0xCFFF), [|(0xE1,0xEC); (0x80,0xBF); (0x80,0xBF)|]; 240 + (0xD000,0xD7FF), [|(0xED,0xED); (0x80,0x9F); (0x80,0xBF)|]; 241 + (0xE000,0xFFFF), [|(0xEE,0xEF); (0x80,0xBF); (0x80,0xBF)|]; 242 + (0x10000,0x3FFFF), [|(0xF0,0xF0); (0x90,0xBF); (0x80,0xBF); (0x80,0xBF)|]; 243 + (0x40000,0xFFFFF), [|(0xF1,0xF3); (0x80,0xBF); (0x80,0xBF); (0x80,0xBF)|]; 244 + (0x100000,0x10FFFF), [|(0xF4,0xF4); (0x80,0x8F); (0x80,0xBF); (0x80,0xBF)|]] 245 + in 246 + let add_range (umap, bmap) ((umin, umax), bytes) = 247 + let len = Array.length bytes in 248 + let bmin i = if i < len then fst bytes.(i) else max_int in 249 + let bmax i = if i < len then snd bytes.(i) else min_int in 250 + let umap = ref umap in 251 + let bmap = ref bmap in 252 + let uchar = ref umin in 253 + let buf = Bytes.create len in 254 + let add len' = 255 + if len <> len' then () else 256 + begin 257 + let bytes = Bytes.copy buf in 258 + let u = Uchar.of_int !uchar in 259 + umap := Umap.add u bytes !umap; 260 + bmap := Bmap.add bytes u !bmap; 261 + incr uchar; 262 + end 263 + in 264 + for b0 = bmin 0 to bmax 0 do 265 + Bytes.unsafe_set buf 0 (Char.chr b0); 266 + for b1 = bmin 1 to bmax 1 do 267 + Bytes.unsafe_set buf 1 (Char.chr b1); 268 + for b2 = bmin 2 to bmax 2 do 269 + Bytes.unsafe_set buf 2 (Char.chr b2); 270 + for b3 = bmin 3 to bmax 3 do 271 + Bytes.unsafe_set buf 3 (Char.chr b3); 272 + add 4; 273 + done; 274 + add 3; 275 + done; 276 + add 2; 277 + done; 278 + add 1; 279 + done; 280 + assert (!uchar - 1 = umax); 281 + (!umap, !bmap) 282 + in 283 + List.fold_left add_range (Umap.empty, Bmap.empty) spec 284 + 285 + let utf8_encode_test umap = 286 + log "Testing UTF-8 encoding of every unicode scalar value against spec.\n"; 287 + let buf = Buffer.create 4 in 288 + let test u = 289 + let u = Uchar.unsafe_of_int u in 290 + let bytes = try Umap.find u umap with Not_found -> assert false in 291 + let bytes = Bytes.unsafe_to_string bytes in 292 + Buffer.clear buf; Uutf.Buffer.add_utf_8 buf u; 293 + if bytes = Buffer.contents buf then () else 294 + fail "UTF-8 encoding error (U+%04X)" (Uchar.to_int u) 295 + in 296 + for i = 0x0000 to 0xD7FF do test i done; 297 + for i = 0xE000 to 0x10FFFF do test i done 298 + 299 + let utf8_decode_test bmap = 300 + log "Testing the UTF-8 decoding of all <= 4 bytes sequences (be patient).\n"; 301 + let spec seq = try `Uchar (Bmap.find seq bmap) with 302 + | Not_found -> `Malformed (Bytes.unsafe_to_string seq) 303 + in 304 + let test seq = 305 + let sseq = Bytes.unsafe_to_string seq in 306 + let dec = List.rev (Uutf.String.fold_utf_8 (fun a _ c -> c :: a) [] sseq) in 307 + match spec seq, dec with 308 + | `Uchar u, [ `Uchar u' ] when u = u' -> `Decoded 309 + | `Malformed _, (`Malformed _) :: _ -> `Malformed 310 + | v, v' :: _ -> fail_decode v v' 311 + | _ -> fail "This should not have happened on specification '%S'." sseq 312 + in 313 + let s1 = Bytes.create 1 314 + and s2 = Bytes.create 2 315 + and s3 = Bytes.create 3 316 + and s4 = Bytes.create 4 317 + in 318 + for b0 = 0x00 to 0xFF do 319 + Bytes.unsafe_set s1 0 (Char.unsafe_chr b0); 320 + if test s1 = `Decoded then () 321 + else begin 322 + Bytes.unsafe_set s2 0 (Char.unsafe_chr b0); 323 + for b1 = 0x00 to 0xFF do 324 + Bytes.unsafe_set s2 1 (Char.unsafe_chr b1); 325 + if test s2 = `Decoded then () 326 + else begin 327 + Bytes.unsafe_set s3 0 (Char.unsafe_chr b0); 328 + Bytes.unsafe_set s3 1 (Char.unsafe_chr b1); 329 + for b2 = 0x00 to 0xFF do 330 + Bytes.unsafe_set s3 2 (Char.unsafe_chr b2); 331 + if test s3 = `Decoded then () 332 + else begin 333 + Bytes.unsafe_set s4 0 (Char.unsafe_chr b0); 334 + Bytes.unsafe_set s4 1 (Char.unsafe_chr b1); 335 + Bytes.unsafe_set s4 2 (Char.unsafe_chr b2); 336 + for b3 = 0x00 to 0xFF do 337 + Bytes.unsafe_set s4 3 (Char.unsafe_chr b3); 338 + ignore (test s4) 339 + done; 340 + end 341 + done; 342 + end 343 + done; 344 + end 345 + done 346 + 347 + let utf8_test () = (* Proof by exhaustiveness... *) 348 + let umap, bmap = utf8_maps () in 349 + utf8_encode_test umap; 350 + (* utf8_decode_test bmap; *) (* too long, commented. *) 351 + () 352 + 353 + let is_uchar_test () = 354 + log "Testing Uchar.is_valid.\n"; 355 + let test cp expected = 356 + let is = Uchar.is_valid cp in 357 + if is <> expected then 358 + fail "Uutf.is_uchar %04X = %b, expected %b" cp is expected 359 + in 360 + for cp = 0x0000 to 0xD7FF do test cp true done; 361 + for cp = 0xD800 to 0xDFFF do test cp false done; 362 + for cp = 0xE000 to 0x10FFFF do test cp true done; 363 + for cp = 0x110000 to 0x120000 do test cp false done 364 + 365 + let test () = 366 + Printexc.record_backtrace true; 367 + codec_test (); 368 + buffer_string_codec_test (); 369 + pos_test (); 370 + guess_test (); 371 + test_sub (); 372 + utf8_test (); 373 + is_uchar_test (); 374 + log "All tests succeeded.\n" 375 + 376 + let () = if not (!Sys.interactive) then test ()

+391

vendor/opam/uutf/test/utftrip.ml

··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2012 The uutf programmers. All rights reserved. 3 + SPDX-License-Identifier: ISC 4 + ---------------------------------------------------------------------------*) 5 + 6 + let str = Printf.sprintf 7 + let pp = Format.fprintf 8 + let pp_pos ppf d = pp ppf "%d.%d:(%d,%06X) " 9 + (Uutf.decoder_line d) (Uutf.decoder_col d) (Uutf.decoder_count d) 10 + (Uutf.decoder_byte_count d) 11 + 12 + let pp_decode inf d ppf v = 13 + pp ppf "@[<h>%s:%a%a@]@\n" inf pp_pos d Uutf.pp_decode v 14 + 15 + let exec = Filename.basename Sys.executable_name 16 + let log f = Format.eprintf ("%s: " ^^ f ^^ "@?") exec 17 + 18 + let input_malformed = ref false 19 + let log_malformed inf d v = 20 + input_malformed := true; log "%a" (pp_decode inf d) v 21 + 22 + (* IO tools *) 23 + 24 + let io_buffer_size = 65536 (* IO_BUFFER_SIZE 4.0.0 *) 25 + let unix_buffer_size = 65536 (* UNIX_BUFFER_SIZE 4.0.0 *) 26 + 27 + let rec unix_read fd s j l = try Unix.read fd s j l with 28 + | Unix.Unix_error (Unix.EINTR, _, _) -> unix_read fd s j l 29 + 30 + let rec unix_write fd s j l = 31 + let rec write fd s j l = try Unix.single_write fd s j l with 32 + | Unix.Unix_error (Unix.EINTR, _, _) -> write fd s j l 33 + in 34 + let wc = write fd s j l in 35 + if wc < l then unix_write fd s (j + wc) (l - wc) else () 36 + 37 + let string_of_channel use_unix ic = 38 + let b = Buffer.create unix_buffer_size in 39 + let input, s = 40 + if use_unix 41 + then unix_read (Unix.descr_of_in_channel ic), Bytes.create unix_buffer_size 42 + else input ic, Bytes.create io_buffer_size 43 + in 44 + let rec loop b input s = 45 + let rc = input s 0 (Bytes.length s) in 46 + if rc = 0 then Buffer.contents b else 47 + (Buffer.add_substring b (Bytes.unsafe_to_string s) 0 rc; loop b input s) 48 + in 49 + loop b input s 50 + 51 + let string_to_channel use_unix oc s = 52 + if not use_unix then output_string oc s else 53 + let s = Bytes.unsafe_of_string s in 54 + unix_write (Unix.descr_of_out_channel oc) s 0 (Bytes.length s) 55 + 56 + let dst_for sout = if sout then `Buffer (Buffer.create 512) else `Channel stdout 57 + let src_for inf sin use_unix = 58 + try 59 + let ic = if inf = "-" then stdin else open_in inf in 60 + if sin then `String (string_of_channel use_unix ic) else `Channel ic 61 + with Sys_error e -> log "%s\n" e; exit 1 62 + 63 + let close_src src = 64 + try match src with `Channel ic when ic <> stdin -> close_in ic | _ -> () with 65 + | Sys_error e -> log "%s\n" e; exit 1 66 + 67 + let src_for_unix inf = 68 + try if inf = "-" then Unix.stdin else Unix.(openfile inf [O_RDONLY] 0) with 69 + | Unix.Unix_error (e, _, v) -> log "%s: %s\n" (Unix.error_message e) v; exit 1 70 + 71 + let close_src_unix fd = try if fd <> Unix.stdin then Unix.close fd with 72 + | Unix.Unix_error (e, _, v) -> log "%s: %s\n" (Unix.error_message e) v; exit 1 73 + 74 + let rec encode_unix fd s e v = match Uutf.encode e v with `Ok -> () 75 + | `Partial -> 76 + unix_write fd s 0 (Bytes.length s - Uutf.Manual.dst_rem e); 77 + Uutf.Manual.dst e s 0 (Bytes.length s); 78 + encode_unix fd s e `Await 79 + 80 + (* Dump *) 81 + 82 + let dump_decode inf d v = 83 + (match v with `Malformed _ -> input_malformed := true | _ -> ()); 84 + (pp_decode inf d) Format.std_formatter v 85 + 86 + let dump_ inf encoding nln src = 87 + let rec loop inf d = match Uutf.decode d with `Await -> assert false 88 + | v -> 89 + dump_decode inf d v; 90 + if v <> `End then loop inf d 91 + in 92 + loop inf (Uutf.decoder ?nln ?encoding src) 93 + 94 + let dump_unix inf encoding nln usize fd = 95 + let rec loop fd s d = match Uutf.decode d with 96 + | `Await -> 97 + let rc = unix_read fd s 0 (Bytes.length s) in 98 + Uutf.Manual.src d s 0 rc; loop fd s d 99 + | v -> dump_decode inf d v; if v <> `End then loop fd s d 100 + in 101 + loop fd (Bytes.create usize) (Uutf.decoder ?nln ?encoding `Manual) 102 + 103 + let dump inf sin use_unix usize ie nln = 104 + if sin || not use_unix then dump_ inf ie nln (src_for inf sin use_unix) else 105 + dump_unix inf ie nln usize (src_for_unix inf) 106 + 107 + (* Guess only *) 108 + 109 + let guess inf = 110 + let d = Uutf.decoder (src_for inf false false) in 111 + ignore (Uutf.decode d); 112 + Format.printf "%s@." (Uutf.encoding_to_string (Uutf.decoder_encoding d)) 113 + 114 + (* Decode only *) 115 + 116 + let decode_ inf encoding nln src = 117 + let malformed = log_malformed inf in 118 + let rec loop d = match Uutf.decode d with `Await -> assert false 119 + | `Uchar _ -> loop d 120 + | `End -> () 121 + | `Malformed _ as v -> malformed d v; loop d 122 + in 123 + loop (Uutf.decoder ?nln ?encoding src); close_src src 124 + 125 + let decode_unix inf encoding nln usize fd = 126 + let malformed = log_malformed inf in 127 + let rec loop fd s d = match Uutf.decode d with 128 + | `Uchar _ -> loop fd s d 129 + | `End -> () 130 + | `Malformed _ as v -> malformed d v; loop fd s d 131 + | `Await -> 132 + let rc = unix_read fd s 0 (Bytes.length s) in 133 + Uutf.Manual.src d s 0 rc; loop fd s d 134 + in 135 + loop fd (Bytes.create usize) (Uutf.decoder ?nln ?encoding `Manual); 136 + close_src_unix fd 137 + 138 + let decode inf sin use_unix usize ie nln = 139 + if sin || not use_unix then decode_ inf ie nln (src_for inf sin use_unix) else 140 + decode_unix inf ie nln usize (src_for_unix inf) 141 + 142 + (* Random encode only *) 143 + 144 + let u_surrogate_count = 0xDFFF - 0xD800 + 1 145 + let uchar_count = (0x10FFFF + 1) - u_surrogate_count 146 + let r_uchar () = 147 + let n = Random.int uchar_count in 148 + Uchar.of_int (if n > 0xD7FF then n + u_surrogate_count else n) 149 + 150 + let r_text encoding encode_f rcount = 151 + encode_f (`Uchar Uutf.u_bom); 152 + for i = 1 to rcount do encode_f (`Uchar (r_uchar ())) done; 153 + encode_f `End 154 + 155 + let encode_f encoding dst = 156 + let e = Uutf.encoder encoding dst in 157 + fun v -> match Uutf.encode e v with `Ok -> () | `Partial -> assert false 158 + 159 + let encode_f_unix usize encoding fd = 160 + let e, s = Uutf.encoder encoding `Manual, Bytes.create usize in 161 + Uutf.Manual.dst e s 0 (Bytes.length s); 162 + encode_unix fd s e 163 + 164 + let r_encode sout use_unix usize rseed rcount oe = 165 + let rseed = match rseed with 166 + | None -> Random.self_init (); Random.int (1 lsl 30 - 1) 167 + | Some rseed -> rseed 168 + in 169 + let dst = dst_for sout in 170 + let oe = match oe with None -> `UTF_8 | Some enc -> enc in 171 + let encode_f = 172 + if sout || not use_unix then encode_f oe dst else 173 + encode_f_unix usize oe Unix.stdout 174 + in 175 + log "Encoding %d random characters with seed %d\n" rcount rseed; 176 + Random.init rseed; r_text oe encode_f rcount; 177 + match dst with `Channel _ -> () 178 + | `Buffer b -> string_to_channel use_unix stdout (Buffer.contents b) 179 + 180 + (* Trip *) 181 + 182 + let trip_ inf nln ie oe src dst = 183 + let malformed d v e = 184 + log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep)) 185 + in 186 + let rec loop d e = function `Await -> assert false 187 + | `Uchar _ as v -> ignore (Uutf.encode e v); loop d e (Uutf.decode d) 188 + | `End -> ignore (Uutf.encode e `End) 189 + | `Malformed _ as v -> malformed d v e; loop d e (Uutf.decode d) 190 + in 191 + let d = Uutf.decoder ?nln ?encoding:ie src in 192 + let e, first = match oe with 193 + | Some enc -> Uutf.encoder enc dst, (Uutf.decode d) 194 + | None -> 195 + let v = Uutf.decode d in (* get the encoding. *) 196 + let enc = match Uutf.decoder_encoding d with 197 + | #Uutf.encoding as enc -> enc | `ISO_8859_1 | `US_ASCII -> `UTF_8 198 + in 199 + Uutf.encoder enc dst, v 200 + in 201 + if (Uutf.encoder_encoding e = `UTF_16 || Uutf.decoder_removed_bom d) 202 + then ignore (Uutf.encode e (`Uchar Uutf.u_bom)); 203 + loop d e first; close_src src 204 + 205 + let trip_unix inf usize nln ie oe fdi fdo = 206 + let malformed d v e = 207 + log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep)) 208 + in 209 + let rec loop fdi fdo ds es d e = function 210 + | `Uchar _ as v -> 211 + encode_unix fdo es e v; loop fdi fdo ds es d e (Uutf.decode d) 212 + | `End -> encode_unix fdo es e `End 213 + | `Malformed _ as v -> malformed d v e; loop fdi fdo ds es d e (Uutf.decode d) 214 + | `Await -> 215 + let rc = unix_read fdi ds 0 (Bytes.length ds) in 216 + Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e (Uutf.decode d) 217 + in 218 + let d, ds = Uutf.decoder ?nln ?encoding:ie `Manual, Bytes.create usize in 219 + let e, es, first = match oe with 220 + | Some enc -> Uutf.encoder enc `Manual, Bytes.create usize, (Uutf.decode d) 221 + | None -> 222 + let rec decode_past_await d = match Uutf.decode d with 223 + | `Await -> 224 + let rc = unix_read fdi ds 0 (Bytes.length ds) in 225 + Uutf.Manual.src d ds 0 rc; decode_past_await d 226 + | v -> v 227 + in 228 + let v = decode_past_await d in (* get encoding. *) 229 + let enc = match Uutf.decoder_encoding d with 230 + | #Uutf.encoding as enc -> enc | `ISO_8859_1 | `US_ASCII -> `UTF_8 231 + in 232 + Uutf.encoder enc `Manual, Bytes.create usize, v 233 + in 234 + Uutf.Manual.dst e es 0 (Bytes.length es); 235 + if (Uutf.encoder_encoding e = `UTF_16 || Uutf.decoder_removed_bom d) 236 + then encode_unix fdo es e (`Uchar Uutf.u_bom); 237 + loop fdi fdo ds es d e first; close_src_unix fdi 238 + 239 + let trip inf sin sout use_unix usize ie oe nln = 240 + let src = src_for inf sin use_unix in 241 + let dst = dst_for sout in 242 + if sin || sout || not use_unix then trip_ inf nln ie oe src dst else 243 + trip_unix inf usize nln ie oe (src_for_unix inf) Unix.stdout; 244 + match dst with `Channel _ -> () 245 + | `Buffer b -> string_to_channel use_unix stdout (Buffer.contents b) 246 + 247 + (* Cmd *) 248 + 249 + let do_cmd cmd inf sin sout use_unix usize ie oe nln rseed rcount = 250 + match cmd with 251 + | `Ascii -> dump inf sin use_unix usize ie nln 252 + | `Guess -> guess inf 253 + | `Decode -> decode inf sin use_unix usize ie nln 254 + | `Encode -> r_encode sout use_unix usize rseed rcount oe 255 + | `Trip -> trip inf sin sout use_unix usize ie oe nln 256 + 257 + (* Cmdline interface *) 258 + 259 + open Cmdliner 260 + 261 + let enc_enum = 262 + [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE; 263 + "UTF-16BE", `UTF_16BE; ] 264 + 265 + let decode_enc_enum = 266 + ("ASCII", `US_ASCII) :: ("latin1", `ISO_8859_1) :: enc_enum 267 + 268 + let ienc = 269 + let doc = str "Decoded (input) encoding, must %s. If unspecified the 270 + encoding is guessed." 271 + (Arg.doc_alts_enum decode_enc_enum) 272 + in 273 + Arg.(value & opt (some (enum decode_enc_enum)) None & 274 + info ["d"; "input-encoding"] ~doc) 275 + 276 + let oenc = 277 + let doc = str "Encoded (output) encoding, must %s. If unspecified the output 278 + encoding is the same as the input encoding except for ASCII 279 + and latin1 where UTF-8 is output." (Arg.doc_alts_enum enc_enum) 280 + in 281 + Arg.(value & opt (some (enum enc_enum)) None & 282 + info ["e"; "output-encoding"] ~doc) 283 + 284 + let nln = 285 + let lf = Uchar.of_int 0x000A in 286 + let nln_enum = ["ascii", `ASCII lf; "nlf", `NLF lf; "readline", `Readline lf] 287 + in 288 + let doc = str "New line normalization to U+000A, must %s. ascii 289 + normalizes CR (U+000D) and CRLF (<U+000D, U+000A>). nlf 290 + normalizes like ascii plus NEL (U+0085). readline 291 + normalizes like nlf plus FF (U+000C), LS (U+2028) and 292 + PS (U+2029)." 293 + (Arg.doc_alts_enum nln_enum) 294 + in 295 + let vopt = Some (`Readline lf) in 296 + Arg.(value & opt ~vopt (some (enum nln_enum)) None & info ["nln"] ~doc) 297 + 298 + let sin = 299 + let doc = "Input everything in a string and decode the string." in 300 + Arg.(value & flag & info [ "input-string" ] ~doc) 301 + 302 + let sout = 303 + let doc = "Encode everything in a string and output the string." in 304 + Arg.(value & flag & info [ "output-string" ] ~doc) 305 + 306 + let use_unix = 307 + let doc = "Use Unix IO." in 308 + Arg.(value & flag & info [ "use-unix" ] ~doc) 309 + 310 + let usize = 311 + let doc = "Unix IO buffer sizes in bytes." in 312 + Arg.(value & opt int unix_buffer_size & info ["unix-size"] ~doc) 313 + 314 + let nat = 315 + let parse s = 316 + try 317 + let v = int_of_string s in 318 + if v > 0 then Ok v else failwith (str "%s must be > 0" s) 319 + with Failure e -> Error e 320 + in 321 + Arg.conv' ~docv:"NAT" (parse, Format.pp_print_int) 322 + 323 + let rseed = 324 + let doc = "Random seed." in 325 + Arg.(value & opt (some nat) None & info ["rseed"] ~doc) 326 + 327 + let rcount = 328 + let doc = "Number of random characters to generate." in 329 + Arg.(value & opt nat 1_000_000 & info ["rcount"] ~doc) 330 + 331 + let file = 332 + let doc = "The input file. Reads from stdin if unspecified." in 333 + Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE") 334 + 335 + let cmd = 336 + let doc = "Output the input text as Unicode scalar values or malformed 337 + sequences, one per line, in the US-ASCII charset with their 338 + position (see POSITION INFORMATION for more details)." 339 + in 340 + let ascii = `Ascii, Arg.info ["a"; "ascii"] ~doc in 341 + let doc = "Only guess an UTF encoding. The result of a guess can only be 342 + UTF-8 or UTF-16{LE,BE}." 343 + in 344 + let guess = `Guess, Arg.info ["g"; "guess"] ~doc in 345 + let doc = "Decode only, no encoding." in 346 + let dec = `Decode, Arg.info ["decode"] ~doc in 347 + let doc = "Encode only (random), no decoding. See option $(b,--rcount)." in 348 + let enc = `Encode, Arg.info ["encode"] ~doc in 349 + Arg.(value & vflag `Trip [ascii; guess; dec; enc]) 350 + 351 + let cmd = 352 + let doc = "Recode UTF-{8,16,16LE,16BE} and latin1 from stdin to stdout." in 353 + let man = [ 354 + `S "DESCRIPTION"; 355 + `P "$(tname) inputs Unicode text from stdin and rewrites it 356 + to stdout in various ways. If no input encoding is specified, 357 + it is guessed. If no output encoding is specified, the input 358 + encoding is used."; 359 + `P "Invalid byte sequences in the input are reported on stderr and 360 + replaced by the Unicode replacement character (U+FFFD) in the output."; 361 + `S "POSITION INFORMATION"; 362 + `P "The format for position information is:"; 363 + `P "filename:line.col:(count,byte)"; 364 + `I ("line", "one-based line number that increments with each newline. 365 + A newline is always determined as being anything that would be 366 + normalized by the option `$(b,--nln)=readline`."); 367 + `I ("col", "zero-based column number that increment with each new 368 + decoded character and zeroes after a newline 369 + is decoded. Note that the column number may not correspond to 370 + user-perceived columns, as any Unicode scalar value, including 371 + combining characters are deemed to have a width of 1."); 372 + `I ("count", "the one-based Unicode scalar value count."); 373 + `I ("byte", "the zero-based end byte offset of the scalar value 374 + in the input stream in hexadecimal."); 375 + `S "EXIT STATUS"; 376 + `P "$(tname) exits with one of the following values:"; 377 + `I ("0", "no error occured"); 378 + `I ("1", "a command line parsing error occured"); 379 + `I ("2", "the input text was malformed"); 380 + `S "BUGS"; 381 + `P "This program is distributed with the Uutf OCaml library. 382 + See http://erratique.ch/software/uutf for contact 383 + information."; ] 384 + in 385 + Cmd.v (Cmd.info "utftrip" ~version:"%%VERSION%%" ~doc ~man) 386 + Term.(const do_cmd $ cmd $ file $ sin $ sout $ use_unix $ usize $ 387 + ienc $ oenc $ nln $ rseed $ rcount) 388 + 389 + let () = match Cmd.eval_value cmd with 390 + | Error _ -> exit 1 391 + | _ -> if !input_malformed then exit 2 else exit 0

+38

vendor/opam/uutf/uutf.opam

··· 1 + opam-version: "2.0" 2 + name: "uutf" 3 + synopsis: "Non-blocking streaming Unicode codec for OCaml" 4 + description: """\ 5 + **Warning.** You are encouraged not to use this library. 6 + 7 + - As of OCaml 4.14, both UTF encoding and decoding are available 8 + in the standard library, see the `String` and `Buffer` modules. 9 + - If you are looking for a stream abstraction compatible with 10 + effect based concurrency look into [`bytesrw`] package.""" 11 + maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>" 12 + authors: "The uutf programmers" 13 + license: "ISC" 14 + tags: ["unicode" "text" "utf-8" "utf-16" "codec" "org:erratique"] 15 + homepage: "https://erratique.ch/software/uutf" 16 + doc: "https://erratique.ch/software/uutf/doc/" 17 + bug-reports: "https://github.com/dbuenzli/uutf/issues" 18 + depends: [ 19 + "ocaml" {>= "4.08.0"} 20 + "ocamlfind" {build} 21 + "ocamlbuild" {build} 22 + "topkg" {build & >= "1.1.0"} 23 + ] 24 + depopts: ["cmdliner"] 25 + conflicts: [ 26 + "cmdliner" {< "1.3.0"} 27 + ] 28 + build: [ 29 + "ocaml" 30 + "pkg/pkg.ml" 31 + "build" 32 + "--dev-pkg" 33 + "%{dev}%" 34 + "--with-cmdliner" 35 + "%{cmdliner:installed}%" 36 + ] 37 + dev-repo: "git+https://erratique.ch/repos/uutf.git" 38 + x-maintenance-intent: ["(latest)"]

Configure Feed

Configure Feed