···11+v17.0.0 2025-09-11 Zagreb
22+-------------------------
33+44+- Unicode 17.0.0 support.
55+66+v16.0.0 2024-09-10 Zagreb
77+-------------------------
88+99+- Unicode 16.0.0 support.

v15.1.0 2023-09-15 Zagreb
1212+-------------------------
1313+1414+- Unicode 15.1.0 support.
1515+- Requires OCaml 4.14.0.
1616+- The `Uunf_string` module was rewritten to use the standard library
1717+ UTF decoders and was moved to the `uunf` library. The `uunf.string`
1818+ library is deprecated, it warns on usage and simply requires `uunf`.
1919+- The sample code was rewritten to use the standard library UTF
2020+ decoders.
2121+2222+v15.0.0 2022-09-15 Zagreb
2323+-------------------------
2424+2525+- Unicode 15.0.0 support.
2626+2727+v14.0.0 2021-09-17 Zagreb
2828+-------------------------
2929+3030+- Unicode 14.0.0 support.
3131+3232+v13.0.0 2020-03-11 La Forclaz (VS)
3333+---------------------------------
3434+3535+- Unicode 13.0.0 support.
3636+- Require OCaml >= 4.03.0.
3737+3838+v12.0.0 2019-03-08 La Forclaz (VS)
3939+----------------------------------
4040+4141+- Unicode 12.0.0 support.
4242+4343+v11.0.0 2018-06-06 Lausanne
4444+---------------------------
4545+4646+- Unicode 11.0.0 support.
4747+- Fix bug when normalizer flushes at the end of stream: a spurious
4848+ `Await` was returned before the final `End`.
4949+5050+v10.0.0 2017-06-20 Cambridge (UK)
5151+---------------------------------
5252+5353+- Unicode 10.0.0 support
5454+- Fix bug in canonical composition algorithm (used by NFC and NFKC forms).
5555+ Thanks to Stephen Dolan for the report.
- Fix regression of `Uunf.ccc` introduced by f4c0363 which went into
  v2.0.{0,1}.

v2.0.1 2017-03-07 La Forclaz (VS)
6060+---------------------------------
6161+6262+- OCaml 4.05.0 compatibility (removal of `Uchar.dump`).
6363+6464+v2.0.0 2016-11-23 Zagreb
6565+------------------------
6666+6767+- Support for Unicode 9.0.0.
6868+- OCaml standard library `Uchar.t` support.
6969+ - Removes and substitutes `type Uunf.uchar = int` by the (abstract)
7070+ `Uchar.t` type. `Uchar.{of,to}_int` allows to recover the previous
7171+ representation.
7272+ - Removes `Uunf.is_scalar_value`. `Uchar.is_valid` can be used instead.
7373+- Safe string support.
7474+- Build depend on topkg.
7575+- Relicense from BSD3 to ISC.
7676+7777+v1.0.0 2015-06-17 Cambridge (UK)
7878+--------------------------------
7979+8080+- Updated for Unicode 8.0.0
8181+- `topkg` support
8282+- `Uunf.add` now eventually returns `` `End`` whenever the latter was
8383+ encoded and the character stream was entirely output. In most existing
8484+ programs this will simply entail to add `` `End`` to the existing
8585+ `` `Await`` case in pattern matches on the result of `Uunf.add`.
8686+- Adds the `Uunf_string` library that allows to directly normalize UTF-X
8787+ OCaml encoded strings. This library depends on `Uutf`.
8888+- Rewrote the utility `unftrip` to use `Cmdliner` which is now
8989+ an optional dependency of the package. The cli interface is
9090+ incompatible with previous versions. Support for random
9191+ Unicode scalar value generation was removed, use `utftrip` from
9292+ the `Uutf` package for that.
9393+- Rewrote the module's data generation to essentially match what is done
9494+ in `Uucp`. Much less ugly, no source file `sed`ding.
9595+9696+v0.9.3 2014-06-16 Cambridge (UK)
9797+--------------------------------
9898+9999+- Updated for Unicode 7.0.0
100100+101101+v0.9.2 2013-10-01 Lausanne
102102+--------------------------
103103+104104+- Updated for Unicode 6.3.0
105105+- OPAM friendly workflow and drop OASIS support.
106106+107107+v0.9.1 2013-01-04 La Forclaz (VS)
108108+---------------------------------
109109+110110+- Updated for Unicode 6.2.0.
111111+- Fix Uunf.is_scalar_value always returning false.
112112+- Make the module completely safe for the client.
113113+- Change command line help of unftrip.
114114+115115+v0.9.0 2012-09-07 Lausanne
116116+--------------------------
117117+118118+First release.
+44
vendor/opam/uunf/DEVEL.md
···11+# New Unicode release
22+33+The file `src/uunf_data.ml` contains generated data. This file needs
44+to be regenerated on new Unicode releases, as well as the `opam` file.
55+66+In order to do so you need to install an updated version of the [uucd]
77+OCaml package which is capable of reading the latest XML Unicode
88+character database.
99+1010+You can then bump the Unicode release number at the top of the `B0.ml`
1111+file. Verify that everything is as expected with:
1212+1313+ b0 -- unicode-version
1414+1515+You should then download a copy of the XML Unicode character database
1616+to the `support/ucd.xml` file which is ignored by git. If you have
1717+`curl` and `unzip` in your `PATH` you can simply issue:
1818+1919+ b0 -- download-ucdxml

You can now proceed to generate the `src/uunf_data.ml` file and update
the `opam` file by issuing:
2323+2424+ b0 -- generate-data
2525+ b0 -- .opam file > opam
2626+2727+[uucd]: http://erratique.ch/software/uucd
2828+2929+# Reference tests

To test the package on the reference normalization tests you must
3232+download a copy of the tests to the `test/NormalizationTest.txt` file
3333+which is ignored by git.
3434+3535+If you have `curl` in your `PATH` you can simply issue:
3636+3737+ b0 -- download-tests

This downloads the tests for the Unicode version mentioned in `B0.ml`.
4040+4141+You can then check them with:
4242+4343+ b0 test
4444+
+13
vendor/opam/uunf/LICENSE.md
···11+Copyright (c) 2012 The uunf programmers
22+33+Permission to use, copy, modify, and/or distribute this software for any
44+purpose with or without fee is hereby granted, provided that the above
55+copyright notice and this permission notice appear in all copies.
66+77+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
88+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
99+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1010+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1111+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1212+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1313+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+46
vendor/opam/uunf/README.md
···11+Uunf — Unicode text normalization for OCaml
22+===========================================
33+%%VERSION%%
44+55+Uunf is an OCaml library for normalizing Unicode text. It supports all
66+Unicode [normalization forms]. The library is independent from any IO
77+mechanism or Unicode text data structure and it can process text
88+without a complete in-memory representation.
99+1010+Uunf is distributed under the ISC license. It has no dependency.
1111+1212+[normalization forms]: http://www.unicode.org/reports/tr15/
1313+1414+Homepage: <http://erratique.ch/software/uunf>
1515+1616+1717+## Installation
1818+1919+Uunf can be installed with `opam`:
2020+2121+ opam install uunf
2222+ opam install uunf cmdliner uutf # For the unftrip tool
2323+2424+If you don't use `opam` consult the [`opam`](opam) file for build
2525+instructions.
2626+2727+2828+## Documentation
2929+3030+The documentation can be consulted [online] or via `odig doc uunf`.
3131+3232+Questions are welcome but better asked on the [OCaml forum] than on
3333+the issue tracker.
3434+3535+[online]: http://erratique.ch/software/uunf/doc/
3636+[OCaml forum]: https://discuss.ocaml.org/
3737+3838+3939+## Sample programs

The [`unftrip`] tool normalizes text provided on standard input.
4242+4343+See also the [doc examples].
4444+4545+[`unftrip`]: test/unftrip.ml
4646+[doc examples]: test/examples.ml
+11
vendor/opam/uunf/_tags
···11+true : bin_annot, safe_string
22+33+<_b0> : -traverse
44+55+<src> : include
66+77+<support> : include
88+<support/**> : package(uucd unix)
99+1010+<test> : include
1111+<test/unftrip*> : package(uutf), package(cmdliner)
+10
vendor/opam/uunf/doc/index.mld
···11+{0 Uunf {%html: <span class="version">%%VERSION%%</span>%}}
22+33+Uunf normalizes Unicode text. See {!Uunf} for more details.
44+55+{1:uunf Library [uunf]}
66+77+{!modules:
88+Uunf
99+Uunf_string
1010+}
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
(* Values returned by the normalizer, see [add]. *)
type ret = [ `Uchar of Uchar.t | `End | `Await ]

(* [pp_ret ppf v] prints [v] on [ppf]; characters are shown as U+XXXX. *)
let pp_ret ppf v =
  match (v :> ret) with
  | `Await -> Format.fprintf ppf "`Await"
  | `End -> Format.fprintf ppf "`End"
  | `Uchar u -> Format.fprintf ppf "`Uchar U+%04X" (Uchar.to_int u)

(* Raises [Invalid_argument] reporting that [add] was given where `Await
   was expected. *)
let err_exp_await add =
  let msg = Format.asprintf "can't add %a, expected `Await" pp_ret add in
  invalid_arg msg

(* Raises [Invalid_argument] reporting that [add] was given after `End. *)
let err_ended add =
  let msg = Format.asprintf "can't add %a, `End already added" pp_ret add in
  invalid_arg msg
1818+1919+(* The normalization process is implemented as described in UAX #15
2020+ section 9.1 for normalizing the concatenation of normalized
2121+ strings. We detect ranges of characters in the input sequence
2222+ enclosed between two characters for which NFX_quick_check=YES *and*
2323+ ccc = 0 (6.1.0 wrongly claims that quick_check=YES implies ccc = 0,
2424+ we therefore call this property nfx_boundary). Only these ranges
2525+ (including the left boundary) need to be bufferized to perform the
2626+ normalization process. *)
2727+2828+(* Characters *)
(* Sentinel meaning "no character"; it lies outside the Unicode range. *)
let ux_none = max_int

(* Placeholder character, overwritten before it is ever output. *)
let u_dumb = `Uchar (Uchar.of_int 0x0000)

(* Normalization properties. *)

let unicode_version = Uunf_data.unicode_version

(* Boundary predicates, one per normalization form, on integer code points. *)
let nfc_boundary cp = Uunf_tmapbool.get Uunf_data.nfc_boundary_map cp
let nfd_boundary cp = Uunf_tmapbool.get Uunf_data.nfd_boundary_map cp
let nfkc_boundary cp = Uunf_tmapbool.get Uunf_data.nfkc_boundary_map cp
let nfkd_boundary cp = Uunf_tmapbool.get Uunf_data.nfkd_boundary_map cp

(* Canonical combining class, on integer code points and on [Uchar.t]. *)
let _ccc cp = Uunf_tmapbyte.get Uunf_data.ccc_map cp
let ccc u = _ccc (Uchar.to_int u)

(* Raw decomposition and composition data lookups on integer code points. *)
let decomp_prop cp = Uunf_tmap.get Uunf_data.decomp_map cp
let compose_prop cp = Uunf_tmap.get Uunf_data.compose_map cp
(* Hangul arithmetic constants.

   [tbase] is one below the first trailing consonant (U+11A8), so a
   trailing index of 0 means "no trailing consonant". [ncount] and
   [scount] are derived from the jamo counts rather than hard-coded, and
   [scount] is bound only once. *)
module H = struct
  let sbase = 0xAC00
  let lbase = 0x1100
  let vbase = 0x1161
  let tbase = 0x11A7
  let lcount = 19
  let vcount = 21
  let tcount = 28
  let ncount = vcount * tcount (* 588 *)
  let scount = lcount * ncount (* 11172 *)
end
(* [decomp u] is the decomposition mapping of [u]. Precomposed Hangul
   syllables (U+AC00..U+D7A3) are decomposed arithmetically into [|l; v|]
   or [|l; v; t|]; every other character is looked up with [decomp_prop]. *)
let decomp u =
  let cp = Uchar.to_int u in
  if 0xAC00 <= cp && cp <= 0xD7A3 then
    begin (* LV or LVT hangul composite *)
      let sindex = cp - H.sbase in
      let l = H.lbase + (sindex / H.ncount) in
      let v = H.vbase + (sindex mod H.ncount) / H.tcount in
      match sindex mod H.tcount with
      | 0 -> [|l; v|]
      | tindex -> [|l; v; H.tbase + tindex|]
    end
  else decomp_prop cp

(* N.B. to help stream-safe text implementers we *could* use bits
   25-27 of [(decomp u).(0)] to indicate the number of initial non
   starters in the NFKD decomposition of [u] and bits 28-30 to
   indicate the non starter count increment. *)
(* [d_compatibility i] is [true] iff bit 24 of [i] is set. *)
let d_compatibility i = (i lsr 24) land 1 = 1

(* [_d_uchar i] keeps the low 21 bits of [i], as an integer. *)
let _d_uchar i = i land 0x1FFFFF

(* [d_uchar i] is [_d_uchar i] converted to [Uchar.t] without validation. *)
let d_uchar i =
  let cp = _d_uchar i in
  Uchar.unsafe_of_int cp
(* [_composite u1 u2] is the code point of the primary composite that is
   canonically equivalent to the sequence <u1, u2>, or [ux_none] if there
   is none. Hangul syllables are composed arithmetically; every other
   pair is looked up in the composition data of [u1]. *)
let _composite u1 u2 =
  if 0x1100 <= u1 && u1 <= 0x1112 then
    begin
      (* [u1] is a leading consonant: it only composes with a vowel. *)
      if u2 < 0x1161 || 0x1175 < u2 then ux_none else
      let l = u1 - H.lbase in                       (* LV hangul composite *)
      let v = u2 - H.vbase in
      H.sbase + l * H.ncount + v * H.tcount
    end
  else if 0xAC00 <= u1 && u1 <= 0xD788 && (u1 - 0x0AC00) mod H.tcount = 0 then
    begin
      (* [u1] is an LV syllable: it only composes with a trailing consonant
         in U+11A8..U+11C2, i.e. H.tbase + 1 .. H.tbase + H.tcount - 1.
         U+11C3 must be rejected: adding it would produce the *next* LV
         syllable, which is not equivalent to <u1, u2>. *)
      if u2 < 0x11A8 || 0x11C2 < u2 then ux_none else
      (u1 + u2 - H.tbase)                          (* LVT hangul composite *)
    end
  else match compose_prop u1 with
  | [||] -> ux_none
  | a (* [u2; c; u2'; c'; ...] sorted *) ->
      (* Pairs are stored flat: second character at even indices, the
         resulting composite at the following odd index. *)
      let len = Array.length a / 2 in
      let rec find i =
        if i >= len then ux_none else
        if a.(i * 2) = u2 then a.(i * 2 + 1) else
        find (i + 1)
      in
      find 0
(* [composite u1 u2] is [_composite] on [Uchar.t] values: [None] when
   [_composite] yields [ux_none], [Some c] otherwise. *)
let composite u1 u2 =
  match _composite (Uchar.to_int u1) (Uchar.to_int u2) with
  | cp when cp = ux_none -> None
  | cp -> Some (Uchar.unsafe_of_int cp)
108108+109109+(* Normalize *)
(* The four normalization forms. *)
type form = [ `NFC | `NFD | `NFKC | `NFKD ]

(* States of the normalizer's state machine, see the public [add]. *)
type state = (* normalizer state. *)
| Start (* no cp seen yet. *)
| Boundary (* cp with boundary = true found in n.uc, no accumulation yet. *)
| Acc (* accumulate until next cp with boundary = true. *)
| Flush (* next cp with boundary = true found, flush previous data. *)
| End (* end of normalization sequence. *)

(* A normalizer. The code points awaiting output are the elements of
   [acc] between the indices [first] and [last] inclusive; the
   accumulator is empty when [first > last]. *)
type t =
  { form : form; (* normalization form. *)
    compat : bool; (* true if compatibility decomposition needed. *)
    compose : bool; (* true if composition needed. *)
    boundary : int -> bool; (* nfx_boundary. *)
    mutable state : state; (* normalizer state. *)
    mutable uc : [`Uchar of Uchar.t]; (* last cp with boundary = true. *)
    mutable acc : int array; (* code point accumulator. *)
    mutable first : int; (* index of first code point in acc. *)
    mutable last : int; (* index of last code point in acc. *)
    mutable is_end : bool;} (* [true] if `End was seen. *)
(* A fresh accumulator of 35 slots, all holding [ux_none]. *)
let create_acc () = Array.make 35 ux_none

(* [create form] is a new normalizer for [form], in the [Start] state and
   with an empty accumulator. *)
let create form =
  let compat = match form with
  | `NFKC | `NFKD -> true
  | `NFC | `NFD -> false
  in
  let compose = match form with
  | `NFC | `NFKC -> true
  | `NFD | `NFKD -> false
  in
  let boundary = match form with
  | `NFC -> nfc_boundary
  | `NFD -> nfd_boundary
  | `NFKC -> nfkc_boundary
  | `NFKD -> nfkd_boundary
  in
  { form = (form :> form);
    compat;
    compose;
    boundary;
    state = Start;
    uc = u_dumb;
    acc = create_acc ();
    first = 0;
    last = -1;
    is_end = false }
(* [get_u n] is the integer code point stored in [n.uc]. *)
let get_u n = match n.uc with `Uchar u -> Uchar.to_int u

(* [acc_empty n] is [true] iff the accumulator holds no code point. *)
let acc_empty n = n.last < n.first

(* [form n] is the normalization form of [n]. *)
let form n = n.form

(* [copy n] is [n] with its own copy of the accumulator array. *)
let copy n =
  let acc = Array.copy n.acc in
  { n with acc }

(* [reset n] puts every mutable field of [n] back to the value given by
   [create]. *)
let reset n =
  n.state <- Start;
  n.uc <- u_dumb;
  n.acc <- create_acc ();
  n.first <- 0;
  n.last <- -1;
  n.is_end <- false
(* [grow_acc n] replaces the accumulator of [n] with one twice as long;
   the existing contents keep their indices and the new slots hold
   [ux_none]. *)
let grow_acc n =
  let old = n.acc in
  let old_len = Array.length old in
  let bigger = Array.make (2 * old_len) ux_none in
  Array.blit old 0 bigger 0 old_len;
  n.acc <- bigger
(* [ordered_add n u] appends [u] to the accumulator of [n], growing it if
   needed (canonical ordering algorithm via insertion sort). A [u] with
   combining class 0 goes in the last slot. Otherwise the trailing code
   points whose class is strictly greater than that of [u] are shifted
   one slot right and [u] is inserted before them. *)
let ordered_add n u =
  n.last <- n.last + 1;
  if n.last = Array.length n.acc then grow_acc n;
  let c = _ccc u in
  if c = 0 then n.acc.(n.last) <- u else
  let rec insert i =
    if i >= 0 && _ccc n.acc.(i) > c
    then (n.acc.(i + 1) <- n.acc.(i); insert (i - 1)) (* shift right. *)
    else n.acc.(i + 1) <- u
  in
  insert (n.last - 1)
(* [add n u] recursively decomposes the code point [u] and appends the
   result to the accumulator of [n] with [ordered_add]. Compatibility
   decompositions are applied only when [n.compat] is [true]. *)
let rec add n u =
  if u < 0xAC00 || 0xD7A3 < u then
    begin match decomp_prop u with
    | [||] -> ordered_add n u
    | d when d_compatibility d.(0) && not n.compat -> ordered_add n u
    | d ->
        (* The first element carries extra bits, strip them first. *)
        add n (_d_uchar d.(0));
        for i = 1 to Array.length d - 1 do add n d.(i) done
    end
  else
    begin (* LV or LVT hangul composite, computed inline to avoid alloc. *)
      let sindex = u - H.sbase in
      let tindex = sindex mod H.tcount in
      ordered_add n (H.lbase + (sindex / H.ncount));
      ordered_add n (H.vbase + (sindex mod H.ncount) / H.tcount);
      if tindex <> 0 then ordered_add n (H.tbase + tindex)
    end
(* [compose n] composes, in place, the code points of the accumulator of
   [n] found between [n.first] and [n.last]; [n.last] decreases by one
   for each composition performed.

   [loop ~last_starter ~prev_ccc i] examines [n.acc.(i)]: [last_starter]
   is the index of the code point candidates are composed with and
   [prev_ccc] is the combining class carried over from the previous
   step. *)
let compose n = (* canonical composition algorithm. *)
  let rec loop ~last_starter ~prev_ccc i =
    if i > n.last then () else
    let ccc_i = _ccc n.acc.(i) in
    let u_comp = _composite n.acc.(last_starter) n.acc.(i) in
    (* No composition if the pair has no composite, or if [n.acc.(i)] has
       class 0 and is not immediately after [last_starter]. *)
    match (u_comp = ux_none || (ccc_i = 0 && last_starter <> i - 1)) with
    | true ->
        (* A code point with class 0 becomes the new [last_starter]. *)
        let last_starter = if ccc_i = 0 then i else last_starter in
        loop ~last_starter ~prev_ccc:ccc_i (i + 1)
    | false ->
        (* A composite exists, but skip it when the previous class is
           non-zero and not smaller than the class of [n.acc.(i)]. *)
        match prev_ccc <> 0 && prev_ccc >= ccc_i with
        | true -> loop ~last_starter ~prev_ccc:ccc_i (i + 1)
        | false ->
            (* Store the composite at [last_starter], delete slot [i] by
               shifting the tail one slot left, then rescan from the slot
               right after [last_starter]. *)
            n.acc.(last_starter) <- u_comp;
            Array.blit n.acc (i + 1) n.acc i (n.last - i);
            n.last <- n.last - 1;
            let prev_ccc = _ccc n.acc.(last_starter) in
            loop ~last_starter ~prev_ccc (last_starter + 1)
  in
  let last_starter = n.first in
  let prev_ccc = _ccc n.acc.(last_starter) in
  loop ~last_starter ~prev_ccc (last_starter + 1)
(* [flush_next n] removes the first code point of the accumulator of [n]
   and returns it as [`Uchar _]. Removing the only remaining code point
   resets the indices to the empty state ([first = 0], [last = -1]). *)
let flush_next n =
  let u = Uchar.unsafe_of_int n.acc.(n.first) in
  if n.first <> n.last then n.first <- n.first + 1
  else (n.first <- 0; n.last <- -1);
  `Uchar u

(* [flush_start n] runs [compose] first when [n.compose] is [true], then
   returns the first code point to output. *)
let flush_start n =
  let () = if n.compose then compose n in
  flush_next n
(* [add n v] is the public entry point of the normalizer state machine.

   - [`Uchar u] feeds a character. Characters for which [n.boundary] is
     [true] are kept in [n.uc]; all others are decomposed into the
     accumulator.
   - [`Await] asks for the next pending output, if any.
   - [`End] signals the end of input and starts the final flush.

   Raises [Invalid_argument] if [`Uchar _] or [`End] is added in the
   [Flush] or [End] state. *)
let add n = function
| `Uchar u as uc ->
    let u = Uchar.to_int u in
    begin match n.state with
    | Boundary ->
        (* Two boundaries in a row: the previous one can be output as is.
           Otherwise accumulation starts with the previous boundary. *)
        if n.boundary u
        then (let prev = n.uc in n.uc <- uc; (prev :> ret))
        else (n.state <- Acc; add n (get_u n); add n u; `Await)
    | Acc ->
        (* A boundary closes the accumulated range: remember it and start
           flushing the accumulator. *)
        if n.boundary u
        then (n.state <- Flush; n.uc <- uc; flush_start n)
        else (add n u; `Await)
    | Start ->
        if n.boundary u
        then (n.state <- Boundary; n.uc <- uc; `Await)
        else (n.state <- Acc; add n u; `Await)
    | Flush -> err_exp_await uc
    | End -> err_ended uc
    end
| `Await ->
    begin match n.state with
    | Flush ->
        (* Output what is left in the accumulator; once it is empty go
           back to [Boundary], or finish if `End was seen. *)
        if not (acc_empty n) then flush_next n else
        if n.is_end then (n.state <- End; `End) else
        (n.state <- Boundary; `Await)
    | Start | Boundary | Acc -> `Await
    | End -> `End
    end
| `End ->
    n.is_end <- true;
    begin match n.state with
    | Boundary -> n.state <- End; (n.uc :> ret) (* output last boundary. *)
    | Acc -> n.state <- Flush; flush_start n
    | Start -> n.state <- End; `End
    | Flush -> err_exp_await `End
    | End -> err_ended `End
    end
+200
vendor/opam/uunf/src/uunf.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(** Unicode text normalization.
77+88+ [Uunf] normalizes Unicode text. It supports all Unicode
99+ normalization forms. The module is independent from any IO
1010+ mechanism or Unicode text data structure and it can process text
1111+ without a complete in-memory representation of the data.
1212+1313+ The supported Unicode version is determined by the {!unicode_version}
1414+ value.
1515+1616+ Consult the {{!basics}basics}, {{!limits}limitations} and
1717+ {{!examples}examples} of use.
1818+1919+ {3 References}
2020+ {ul
2121+ {- The Unicode Consortium.
2222+ {e {{:http://www.unicode.org/versions/latest}The Unicode Standard}}.
2323+ (latest version)}
2424+ {- Mark Davis.
2525+ {e {{:http://www.unicode.org/reports/tr15/}UAX #15 Unicode Normalization
2626+ Forms}}. (latest version)}
2727+ {- The Unicode Consortium.
2828+ {e {{:http://www.unicode.org/charts/normalization/}Normalization charts}.
2929+ }}} *)
3030+3131+(** {1 Normalize} *)
3232+3333+type form = [ `NFD | `NFC | `NFKD | `NFKC ]
3434+(** The type for normalization forms.
3535+ {ul
3636+ {- [`NFD] {{:http://www.unicode.org/glossary/#normalization_form_d}
3737+ normalization form D}, canonical decomposition.}
3838+ {- [`NFC] {{:http://www.unicode.org/glossary/#normalization_form_c}
3939+ normalization form C}, canonical decomposition followed by
4040+ canonical composition
4141+ ({{:http://www.w3.org/TR/charmod-norm/}recommended} for the www).}
4242+ {- [`NFKD] {{:http://www.unicode.org/glossary/#normalization_form_kd}
4343+ normalization form KD}, compatibility decomposition.}
4444+ {- [`NFKC] {{:http://www.unicode.org/glossary/#normalization_form_kc}
4545+ normalization form KC}, compatibility decomposition,
4646+ followed by canonical composition.}} *)
4747+4848+type t
4949+(** The type for Unicode text normalizers. *)
5050+5151+type ret = [ `Uchar of Uchar.t | `End | `Await ]
5252+(** The type for normalizer results. See {!add}. *)
val create : [< form ] -> t
(** [create nf] is a Unicode text normalizer for the normal form [nf]. *)
5656+5757+val form : t -> form
5858+(** [form n] is the normalization form of [n]. *)
val add : t -> [ `Uchar of Uchar.t | `Await | `End ] -> ret
(** [add n v] is:
    {ul
    {- [`Uchar u] if [u] is the next character in the normalized
       sequence. The client must then call [add] with [`Await]
       until [`Await] or [`End] is returned.}
    {- [`Await] when the normalizer is ready to add a new
       [`Uchar] or [`End].}
    {- [`End] when [`End] was added and the normalized sequence was
       entirely output.}}

    For [v] use [`Uchar u] to add a new character to the sequence
    to normalize and [`End] to signal the end of sequence. After
    adding one of these two values, always call [add] with [`Await]
    until [`Await] or [`End] is returned.

    {b Raises.} [Invalid_argument] if [`Uchar] or [`End] is
    added directly after a [`Uchar] was returned by the normalizer
    or if a [`Uchar] or [`End] is added after [`End] was added and
    the sequence was entirely output. *)
7777+7878+val reset : t -> unit
7979+(** [reset n] resets the normalizer to a state equivalent to the
8080+ state of [Uunf.create (Uunf.form n)]. *)
8181+8282+val copy : t -> t
8383+(** [copy n] is a copy of [n] in its current state. Subsequent
8484+ {!add}s on [n] do not affect the copy. *)
8585+8686+val pp_ret : Format.formatter -> ret -> unit
8787+(** [pp_ret ppf v] prints an unspecified representation of [v] on [ppf]. *)
8888+8989+(** {1:props Normalization properties}
9090+9191+ These properties are used internally to implement the normalizers.
9292+ They are not needed to use the module but are exposed as they may
9393+ be useful to implement other algorithms. *)
9494+9595+val unicode_version : string
9696+(** [unicode_version] is the Unicode version supported by the module. *)
9797+9898+val ccc : Uchar.t -> int
9999+(** [ccc u] is [u]'s
100100+ {{:http://www.unicode.org/glossary/#combining_class}canonical combining
101101+ class} value. *)
102102+103103+val decomp : Uchar.t -> int array
104104+(** [decomp u] is [u]'s
105105+ {{:http://www.unicode.org/glossary/#decomposition_mapping}decomposition
106106+ mapping}. If the empty array is returned, [u] decomposes to itself.
107107+108108+ The first number in the array contains additional information, it
109109+ cannot be used as an {!Uchar.t}. Use {!d_uchar} on the number to get the
110110+ actual character and {!d_compatibility} to find out if this is
111111+ a compatibility decomposition. All other characters of the array
112112+ are guaranteed to be convertible using {!Uchar.of_int}.
113113+114114+ {b Warning.} Do {b not} mutate the array. *)
115115+116116+val d_uchar : int -> Uchar.t
117117+(** See {!decomp}. *)
118118+119119+val d_compatibility : int -> bool
120120+(** See {!decomp}. *)
121121+122122+val composite : Uchar.t -> Uchar.t -> Uchar.t option
123123+(** [composite u1 u2] is the
124124+ {{:http://www.unicode.org/glossary/#primary_composite}primary composite}
125125+ canonically equivalent to the sequence [<u1,u2>], if any. *)
126126+127127+(** {1:limits Limitations}
128128+129129+ An [Uunf] normalizer consumes only a small bounded amount of
130130+ memory on ordinary, {e meaningful} text. However on legal but {e
131131+ degenerate} text like a
132132+ {{:http://www.unicode.org/glossary/#starter}starter} followed by
133133+ 10'000 combining
134134+ {{:http://www.unicode.org/glossary/#nonspacing_mark}non-spacing
135135+ marks} it will have to bufferize all the marks (a workaround is
136136+ to first convert your input to
137137+ {{:http://www.unicode.org/reports/tr15/#Stream_Safe_Text_Format}stream-safe
138138+ text format}). *)
139139+140140+(** {1:basics Basics}
141141+142142+ A normalizer is a stateful filter that inputs a sequence of
143143+ characters and outputs an equivalent sequence in the requested
144144+ normal form.
145145+146146+ The function {!create} returns a new normalizer for a given normal
147147+ form:
148148+{[
149149+let nfd = Uunf.create `NFD
150150+]}
151151+ To add characters to the sequence to normalize, call {!add} on
152152+ [nfd] with [`Uchar _]. To end the sequence, call {!add} on [nfd]
153153+ with [`End]. The normalized sequence of characters is returned,
154154+ character by character, by the successive calls to {!add}.
155155+156156+ The client and the normalizer must wait on each other to limit
157157+ internal buffering: each time the client adds to the sequence by
158158+ calling {!add} with [`Uchar] or [`End] it must continue to call
159159+ {!add} with [`Await] until the normalizer returns [`Await]. In
160160+ practice this leads to the following kind of control flow:
161161+{[
162162+let rec add acc v = match Uunf.add nfd v with
163163+| `Uchar u -> add (u :: acc) `Await
164164+| `Await | `End -> acc
165165+]}
166166+ For example to normalize the character [U+00E9] (é) with [nfd] to a list
167167+ of characters we can write:
168168+{[
169169+let e_acute = Uchar.of_int 0x00E9
170170+let e_acute_nfd = List.rev (add (add [] (`Uchar e_acute)) `End)
171171+]}
172172+ The next section has more examples.
173173+*)
174174+175175+(** {1:examples Examples}
176176+177177+ {2:utf8 UTF-8 normalization}
178178+179179+ [utf_8_normalize nf s] is the UTF-8 encoded normal form [nf] of
180180+ the UTF-8 encoded string [s].
181181+{[
182182+let utf_8_normalize nf s =
183183+ let rec add buf normalizer v = match Uunf.add normalizer v with
184184+ | `Uchar u -> Buffer.add_utf_8_uchar buf u; add buf normalizer `Await
185185+ | `Await | `End -> ()
186186+ in
187187+ let rec loop buf s i max normalizer =
188188+ if i > max then (add buf normalizer `End; Buffer.contents buf) else
189189+ let dec = String.get_utf_8_uchar s i in
190190+ add buf normalizer (`Uchar (Uchar.utf_decode_uchar dec));
191191+ loop buf s (i + Uchar.utf_decode_length dec) max normalizer
192192+ in
193193+ let buf = Buffer.create (String.length s * 3) in
194194+ let normalizer = Uunf.create nf in
195195+ loop buf s 0 (String.length s - 1) normalizer
196196+]}
197197+198198+ Note that this functionality is available directly through
199199+ {!Uunf_string.normalize_utf_8}
200200+*)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2020 The uucp programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
(* Short names for the [Format] functions used by the printers. *)
let pf = Format.fprintf
let strf = Format.asprintf
let string = Format.pp_print_string

(* [string_X ppf s] prints [s] as a double-quoted literal in which every
   byte is written as a [\xNN] escape. After every 16 bytes a backslash
   and a forced newline are emitted. *)
let string_X ppf s =
  Format.pp_open_vbox ppf 1;
  string ppf "\"";
  String.iteri
    (fun i c ->
       if i > 0 && i mod 16 = 0 then pf ppf "\\@\n";
       pf ppf "\\x%02x" (Char.code c))
    s;
  string ppf "\"";
  Format.pp_close_box ppf ()
(* Like [string_X], but the empty string is printed as the identifier
   [snil]. *)
let string_XN ppf s = if s = "" then string ppf "snil" else string_X ppf s

let bool = Format.pp_print_bool
let sp = Format.pp_print_space

(* A semicolon followed by a break hint. *)
let semi ppf () =
  string ppf ";";
  sp ppf ()

let int = Format.pp_print_int

(* [iter i ?sep pp ppf x] prints with [pp] each element that the iterator
   [i] enumerates from [x], calling [sep] between consecutive elements. *)
let iter i ?(sep = sp) pp ppf x =
  let is_first = ref true in
  let pp_elt v =
    if !is_first then is_first := false else sep ppf ();
    pp ppf v
  in
  i pp_elt x

(* Array literal syntax [[|e1; e2; ...|]] for any iterable. *)
let as_array i pp ppf = pf ppf "@[<2>[|%a|]@]" (iter i ~sep:semi pp)
let array pp = as_array Array.iter pp

(* Like [array], but the empty array is printed as the identifier [nil]. *)
let array_N pp ppf a = match a with
| [||] -> string ppf "nil"
| a -> array pp ppf a
(* Heterogeneous record descriptions. A value of type ['a record] is a
   list of (field name, field printer) pairs, written with the usual list
   syntax. The type index ['a] is the type of the function that takes one
   argument per field, in order, and returns [unit]. *)
module R = struct
  type _ record =
  | [] : unit record
  | (::) :
      (string * (Format.formatter -> 'a -> unit)) * 'b record ->
      ('a -> 'b) record
end
(* [record fields ppf v1 ... vn] prints on [ppf] an OCaml record
   expression [{ name1 = v1; ...; namen = vn }], one value per entry of
   [fields].

   [go k fields] consumes one value per remaining field and extends the
   continuation [k] with the printing of that field. Printing only
   happens once all values were received, when [k] is run inside the
   enclosing box. *)
let record record ppf =
  let field name pp_v ppf v = pf ppf "@[<1>%s =@ %a@]" name pp_v v in
  let open R in (* 4.03 compat *)
  let rec go : type a. (unit -> unit) -> a R.record -> a = fun k -> function
  | [] -> pf ppf "@[<2>{ %a }@]" (fun _ -> k) ()
  | [name, pp_v] -> (* last field: no trailing separator. *)
      fun v -> go (fun () -> k (); field name pp_v ppf v) []
  | (name, pp_v) :: record ->
      fun v -> go (fun () -> k (); field name pp_v ppf v; semi ppf ()) record
  in
  go ignore record
+29
vendor/opam/uunf/src/uunf_string.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2015 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
(* [normalize_utf_x get_utf add_utf nf s] is [s] normalized to [nf].
   [get_utf] decodes one character of [s] at a byte index and [add_utf]
   encodes one character to a buffer. *)
let normalize_utf_x get_utf add_utf nf s =
  let out = Buffer.create (String.length s * 3) in
  let normalizer = Uunf.create nf in
  (* Feed [v], then drain every character the normalizer has ready. *)
  let rec feed v = match Uunf.add normalizer v with
  | `Uchar u -> add_utf out u; feed `Await
  | `Await | `End -> ()
  in
  let len = String.length s in
  let rec loop i =
    if i >= len then (feed `End; Buffer.contents out) else
    let dec = get_utf s i in
    feed (`Uchar (Uchar.utf_decode_uchar dec));
    loop (i + Uchar.utf_decode_length dec)
  in
  loop 0
(* The three entry points only differ by the standard library decoder and
   encoder handed to [normalize_utf_x]. *)

let normalize_utf_8 nf s =
  let decode = String.get_utf_8_uchar in
  let encode = Buffer.add_utf_8_uchar in
  normalize_utf_x decode encode nf s

let normalize_utf_16be nf s =
  let decode = String.get_utf_16be_uchar in
  let encode = Buffer.add_utf_16be_uchar in
  normalize_utf_x decode encode nf s

let normalize_utf_16le nf s =
  let decode = String.get_utf_16le_uchar in
  let encode = Buffer.add_utf_16le_uchar in
  normalize_utf_x decode encode nf s
+25
vendor/opam/uunf/src/uunf_string.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2015 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(** Unicode text normalization on UTF OCaml strings.
77+88+ {!Uunf} functions acting directly on UTF encoded OCaml strings.
99+1010+ {b Warning.} All these function silently replace malformed encoded Unicode
1111+ data by a {!Stdlib.Uchar.rep} character. *)
1212+1313+(** {1:norm Normalize} *)
1414+1515+val normalize_utf_8 : Uunf.form -> string -> string
1616+(** [normalize_utf_8 nf s] is the UTF-8 encoded string [s] in normal
1717+ form [nf]. *)
1818+1919+val normalize_utf_16be : Uunf.form -> string -> string
2020+(** [normalize_utf_16be nf s] is the UTF-16BE encoded string [s] in
2121+ normal form [nf]. *)
2222+2323+val normalize_utf_16le : Uunf.form -> string -> string
2424+(** [normalize_utf_16le nf s] is the UTF-16LE encoded string [s] in
2525+ normal form [nf]. *)
+58
vendor/opam/uunf/src/uunf_tmap.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(* Trie character maps *)
(* A three-level trie indexed by code point. Absent sub-arrays are
   represented by the empty array [nil] and stand for [default]. *)
type 'a t =
  { default : 'a; (* default value. *)
    l0 : 'a array array array } (* 0x1FFFFF as 0x1FF - 0xFF - 0xF. *)

let nil = [||]
let l0_shift = 12
let l0_size = 272 (* 0x10F + 1 *)
let l1_shift = 4
let l1_mask = 0xFF
let l1_size = 256 (* 0xFF + 1 *)
let l2_mask = 0xF
let l2_size = 16 (* 0xF + 1 *)

(* [get m u] is the value bound to [u] in [m], [m.default] if there is
   none. Indices are not bounds checked. *)
let get m u =
  match Array.unsafe_get m.l0 (u lsr l0_shift) with
  | l1 when l1 == nil -> m.default
  | l1 ->
      match Array.unsafe_get l1 ((u lsr l1_shift) land l1_mask) with
      | l2 when l2 == nil -> m.default
      | l2 -> Array.unsafe_get l2 (u land l2_mask)

(* [create default] is a map in which every code point maps to [default]. *)
let create default = { default; l0 = Array.make l0_size nil }

(* [set m u v] binds [u] to [v] in [m], allocating the missing levels.
   Nothing is done when [v] equals [m.default]. *)
let set m u v =
  if v <> m.default then begin
    let i = u lsr l0_shift in
    let j = (u lsr l1_shift) land l1_mask in
    if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size nil;
    let l1 = m.l0.(i) in
    if l1.(j) == nil then l1.(j) <- Array.make l2_size m.default;
    l1.(j).(u land l2_mask) <- v
  end
3535+3636+let size v_size m = match m.l0 with
3737+| [||] -> 3 + 1 + v_size m.default
3838+| l0 ->
3939+ let size = ref (3 + v_size m.default + 1 + Array.length l0) in
4040+ for i = 0 to Array.length l0 - 1 do match l0.(i) with
4141+ | [||] -> ()
4242+ | l1 ->
4343+ size := !size + (1 + Array.length l1);
4444+ for j = 0 to Array.length l1 - 1 do match l1.(j) with
4545+ | [||] -> ()
4646+ | l2 ->
4747+ size := !size + (1 + Array.length l2);
4848+ for k = 0 to Array.length l2 - 1 do
4949+ size := !size + v_size l2.(k)
5050+ done;
5151+ done;
5252+ done;
5353+ !size
(* [dump pp_v ppf m] prints [m] on [ppf] as a record with fields [default]
   and [l0], values being printed with [pp_v]. *)
let dump pp_v ppf m =
  Uunf_fmt.record
    ["default", pp_v;
     "l0", Uunf_fmt.array (Uunf_fmt.array_N (Uunf_fmt.array_N pp_v))]
    ppf m.default m.l0
+73
vendor/opam/uunf/src/uunf_tmapbool.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(* Trie character boolean maps *)
(* Maps from code points to booleans, stored as a trie whose leaves are
   bit sets packed in strings. A code point [u] is split as
   0x1FF - 0xF - 0xFF: [u lsr l0_shift] indexes [l0],
   [(u lsr l1_shift) land l1_mask] indexes the second level and
   [u land l2_mask] is the bit number in the leaf. Missing levels are [nil]
   or [snil] and read as [default]. *)
type t =
  { default : bool;                (* default value. *)
    l0 : string array array }      (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *)

let nil = [||]
let snil = ""
let l0_shift = 12
let l0_size = 272 (* 0x10F + 1 *)
let l1_shift = 8
let l1_mask = 0xF
let l1_size = 16 (* 0xF + 1 *)
let l2_mask = 0xFF
let l2_size = 32 (* 0xFF + 1 / 8 *)

(* [get m u] is the boolean bound to [u] in [m], or [m.default] if the
   leaf holding [u] was never allocated. Reads are unchecked: [u] is not
   range checked. *)
let get m u =
  let l1 = Array.unsafe_get m.l0 (u lsr l0_shift) in
  if l1 == nil then m.default else
  let l2 = Array.unsafe_get l1 ((u lsr l1_shift) land l1_mask) in
  if l2 == snil then m.default else
  let k = u land l2_mask in
  let byte = Char.code (String.unsafe_get l2 (k lsr 3 (* / 8 *))) in
  byte land (1 lsl (k land 7 (* mod 8 *))) > 0

(* [create default] is a map binding every code point to [default]. *)
let create default = { default; l0 = Array.make l0_size nil }

(* [set m u b] binds [u] to [b] in [m].

   A value equal to [m.default] never allocates a leaf. It is however
   written to an already allocated leaf, so that a code point previously
   bound to the other value can be reset; this case used to be silently
   ignored. Leaves are strings mutated in place through
   [Bytes.unsafe_of_string]. *)
let set m u b =
  let i = u lsr l0_shift in
  let j = (u lsr l1_shift) land l1_mask in
  let k = u land l2_mask in
  let byte_num = k lsr 3 (* / 8 *) in
  let mask = 1 lsl (k land 7 (* mod 8 *)) in
  let write blob =
    let bytes = Bytes.unsafe_of_string blob in
    let old = Char.code (Bytes.get bytes byte_num) in
    let byte = if b then old lor mask else old land (lnot mask) in
    Bytes.set bytes byte_num (Char.unsafe_chr byte)
  in
  if b = m.default then begin
    (* Length checks make this path a no-op whenever no leaf exists. *)
    if i < Array.length m.l0 then
      let l1 = m.l0.(i) in
      if j < Array.length l1 && byte_num < String.length l1.(j)
      then write l1.(j)
  end else begin
    if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil;
    let l1 = m.l0.(i) in
    if l1.(j) == snil then begin
      (* A fresh leaf has every bit set to the default value. *)
      let fill = if m.default then '\xFF' else '\x00' in
      l1.(j) <- Bytes.unsafe_to_string (Bytes.make l2_size fill)
    end;
    write l1.(j)
  end

(* [size m] estimates the size of [m] in words: 3 + 1 words for the record
   and the [l0] header, one word per array slot plus one header word per
   allocated second level, and one header word plus the payload words of
   every leaf string. *)
let size m =
  let leaf_words acc s = acc + 1 + ((String.length s * 8) / Sys.word_size) in
  let l1_words acc l1 =
    if Array.length l1 = 0 then acc else
    Array.fold_left leaf_words (acc + 1 + Array.length l1) l1
  in
  Array.fold_left l1_words (3 + 1 + Array.length m.l0) m.l0

(* [iter_blobs f m] applies [f] to every second level slot of [m], empty
   leaves included. *)
let iter_blobs f m = Array.iter (fun l1 -> Array.iter f l1) m.l0
(* [dump_pp pp_v ppf m] prints [m] on [ppf] as a record with fields
   [default] and [l0], leaf strings being printed with [pp_v]. *)
let dump_pp pp_v ppf m =
  Uunf_fmt.record
    ["default", Uunf_fmt.bool;
     "l0", Uunf_fmt.array (Uunf_fmt.array_N pp_v)]
    ppf m.default m.l0

(* Default leaf printer and the dumper that uses it. *)
let pp_v = Uunf_fmt.string_XN
let dump = dump_pp pp_v
+62
vendor/opam/uunf/src/uunf_tmapbyte.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(* Trie character byte maps *)
(* Maps from code points to bytes, stored as a trie whose leaves are
   strings of [l2_size] bytes. A code point [u] is split as
   0x1FF - 0xF - 0xFF: [u lsr l0_shift] indexes [l0],
   [(u lsr l1_shift) land l1_mask] indexes the second level and
   [u land l2_mask] indexes the leaf. Missing levels are [nil] or [snil]
   and read as [default]. *)
type t =
  { default : int;                 (* default value. *)
    l0 : string array array }      (* 0x1FFFFF as 0x1FF - 0xF - 0xFF *)

let nil = [||]
let snil = ""
let l0_shift = 12
let l0_size = 272 (* 0x10F + 1 *)
let l1_shift = 8
let l1_mask = 0xF
let l1_size = 16 (* 0xF + 1 *)
let l2_mask = 0xFF
let l2_size = 256 (* 0xFF + 1 *)

(* [get m u] is the byte bound to [u] in [m], or [m.default] if the leaf
   holding [u] was never allocated. Only the top level access is bounds
   checked. *)
let get m u =
  let l1 = Array.get m.l0 (u lsr l0_shift) in
  if l1 == nil then m.default else
  let l2 = Array.unsafe_get l1 ((u lsr l1_shift) land l1_mask) in
  if l2 == snil then m.default else
  Char.code (String.unsafe_get l2 (u land l2_mask))

(* [create default] is a map binding every code point to [default]. *)
let create default = { default; l0 = Array.make l0_size nil }

(* [set m u byte] binds [u] to [byte] in [m]. [byte] is converted with
   [Char.unsafe_chr], its range is not checked.

   A value equal to [m.default] never allocates a leaf. It is however
   written to an already allocated leaf, so that a code point previously
   bound to another value can be reset; this case used to be silently
   ignored. Leaves are strings mutated in place through
   [Bytes.unsafe_of_string]. *)
let set m u byte =
  let i = u lsr l0_shift in
  let j = (u lsr l1_shift) land l1_mask in
  let k = u land l2_mask in
  let write blob =
    Bytes.set (Bytes.unsafe_of_string blob) k (Char.unsafe_chr byte)
  in
  if byte = m.default then begin
    (* Length checks make this path a no-op whenever no leaf exists. *)
    if i < Array.length m.l0 then
      let l1 = m.l0.(i) in
      if j < Array.length l1 && k < String.length l1.(j) then write l1.(j)
  end else begin
    if m.l0.(i) == nil then m.l0.(i) <- Array.make l1_size snil;
    let l1 = m.l0.(i) in
    if l1.(j) == snil then
      l1.(j) <-
        Bytes.unsafe_to_string (Bytes.make l2_size (Char.chr m.default));
    write l1.(j)
  end

(* [size m] estimates the size of [m] in words: 3 + 1 words for the record
   and the [l0] header, one word per array slot plus one header word per
   allocated second level, and one header word plus the payload words of
   every leaf string. *)
let size m =
  let leaf_words acc s = acc + 1 + ((String.length s * 8) / Sys.word_size) in
  let l1_words acc l1 =
    if Array.length l1 = 0 then acc else
    Array.fold_left leaf_words (acc + 1 + Array.length l1) l1
  in
  Array.fold_left l1_words (3 + 1 + Array.length m.l0) m.l0

(* [iter_blobs f m] applies [f] to every second level slot of [m], empty
   leaves included. *)
let iter_blobs f m = Array.iter (fun l1 -> Array.iter f l1) m.l0
(* [dump_pp pp_v ppf m] prints [m] on [ppf] as a record with fields
   [default] and [l0], leaf strings being printed with [pp_v]. *)
let dump_pp pp_v ppf m =
  Uunf_fmt.record
    ["default", Uunf_fmt.int;
     "l0", Uunf_fmt.array (Uunf_fmt.array_N pp_v)]
    ppf m.default m.l0

(* Default leaf printer and the dumper that uses it. *)
let pp_v = Uunf_fmt.string_XN
let dump = dump_pp pp_v
+95
vendor/opam/uunf/support/gen.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(* Extracts normalization data from the Unicode Character Database *)
(* Formatting shortcuts. [log] writes on stderr and flushes after each
   message. *)
let log fmt = Printf.eprintf (fmt ^^ "%!")
let pp = Format.fprintf
let str = Printf.sprintf

(* [str_of_size s] renders a size of [s] machine words as an amount of
   bytes scaled by powers of 1024 and suffixed by Ko, Mo or Go. *)
let str_of_size s =
  let bytes_per_word = Sys.word_size / 8 in
  let b = s * bytes_per_word in
  let scaled power = float_of_int b /. (1024. ** float_of_int power) in
  if b >= 1_073_741_824 then str "%.1f Go" (scaled 3) else
  if b >= 1_048_576 then str "%.1f Mo" (scaled 2) else
  str "%.1f Ko" (scaled 1)
1616+1717+(* Characters *)
(* [is_hangul_syllabe u] is [true] iff [u] is in the range
   U+AC00..U+D7A3. *)
let is_hangul_syllabe u = u >= 0xAC00 && u <= 0xD7A3
(* [iter_uchar_ints f] applies [f] to the integer value of every Unicode
   scalar value, in increasing order from [Uchar.min] to [Uchar.max]. *)
let iter_uchar_ints f =
  let u = ref Uchar.min in
  while not (Uchar.equal !u Uchar.max) do
    f (Uchar.to_int !u);
    u := Uchar.succ !u
  done;
  f (Uchar.to_int Uchar.max)
2828+2929+(* Compact maps from characters to booleans. *)
(* [bool_prop_maps prop] builds two boolean trie maps holding [prop u] for
   every scalar value [u]: the first has default [true], the second
   default [false]. *)
let bool_prop_maps prop =
  let tm = Uunf_tmapbool.create true in
  let fm = Uunf_tmapbool.create false in
  iter_uchar_ints (fun u ->
      let b = prop u in
      Uunf_tmapbool.set tm u b;
      Uunf_tmapbool.set fm u b);
  tm, fm

(* [assert_bool_prop_maps prop tm fm] fails with [Failure] on the first
   scalar value for which [tm] or [fm] disagrees with [prop]. *)
let assert_bool_prop_maps prop tm fm =
  let agrees b m u = Uunf_tmapbool.get m u = b in
  iter_uchar_ints (fun u ->
      let b = prop u in
      if not (agrees b tm u && agrees b fm u) then
        failwith (str "bool prop map failure for U+%04X" u))
4949+5050+(* Compact maps from characters to bytes. *)
(* [byte_prop_map ~default prop] is a byte trie map with default [default]
   holding [prop u] for every scalar value [u]. *)
let byte_prop_map ~default prop =
  let m = Uunf_tmapbyte.create default in
  iter_uchar_ints (fun u -> Uunf_tmapbyte.set m u (prop u));
  m

(* [assert_byte_prop_map prop m] fails with [Failure] on the first scalar
   value for which [m] disagrees with [prop]. *)
let assert_byte_prop_map prop m =
  iter_uchar_ints (fun u ->
      if prop u <> Uunf_tmapbyte.get m u then
        failwith (str "byte prop map failure for U+%04X" u))
6363+6464+(* Compact maps from characters to arbitrary values. *)
(* [prop_map ~default prop] is a trie map with default [default] holding
   [prop u] for every scalar value [u]. *)
let prop_map ~default prop =
  let m = Uunf_tmap.create default in
  iter_uchar_ints (fun u -> Uunf_tmap.set m u (prop u));
  m

(* [assert_prop_map prop m] fails with [Failure] on the first scalar value
   for which [m] disagrees with [prop]. *)
let assert_prop_map prop m =
  iter_uchar_ints (fun u ->
      if prop u <> Uunf_tmap.get m u then
        failwith (str "prop map failure for U+%04X" u))
(* [ucd_get ucd u p pstr] is the value of property [p] for code point [u]
   in [ucd]. Raises [Invalid_argument] if there is none; [pstr] names the
   property in the error message. *)
let ucd_get ucd u p pstr =
  match Uucd.cp_prop ucd u p with
  | Some value -> value
  | None -> invalid_arg (str "no %s property for U+%04X" pstr u)
8181+8282+(* Generate a module *)
(* Current year in UTC, computed once when this module is initialized. It
   is used as the copyright year of generated modules. *)
8383+8484+let year = (Unix.gmtime (Unix.gettimeofday ())).Unix.tm_year + 1900
(* [pp_mod pp_mod ppf m] prints a generated module on [ppf]: a license
   header carrying [year], a do-not-edit warning, then [m] printed with the
   [pp_mod] argument inside a box. Note that the first parameter shadows
   the name of this function in its own body. *)
8585+8686+let pp_mod pp_mod ppf m =
8787+ pp ppf
8888+"\
8989+(*---------------------------------------------------------------------------
9090+ Copyright (c) %d The uunf programmers. All rights reserved.
9191+ SPDX-License-Identifier: ISC
9292+ ---------------------------------------------------------------------------*)
9393+9494+(* WARNING do not edit. This file was automatically generated. *)
9595+@\n@[%a@]@\n" year pp_mod m
+173
vendor/opam/uunf/support/gen_norm.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2015 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
let strf = Printf.sprintf
let pp = Format.fprintf
let str = Format.asprintf

(* Structure sharing *)

(* [intern ?eqh iter pp_v ppf x] prints on [ppf] one definition
   [let vNNN = ...] for each distinct value visited by [iter] in [x], in
   visit order, and returns a printer that prints the name of a value that
   was defined and falls back to [pp_v] for any other value. [eqh] is the
   equality and hash pair used to compare values; it defaults to structural
   equality and [Hashtbl.hash]. *)
let intern (type a) ?eqh iter pp_v ppf x =
  let equal, hash = match eqh with
  | Some fg -> fg
  | None -> (=), Hashtbl.hash
  in
  let module Names = Hashtbl.Make (struct
    type t = a
    let equal = equal
    let hash = hash
  end) in
  let names = Names.create 23 in
  let define v =
    if not (Names.mem names v) then begin
      (* One binding per distinct value: the table length is the number of
         names handed out so far. *)
      let name = str "v%03d" (Names.length names) in
      Names.add names v name;
      pp ppf "@[<2>let %s =@ %a@]@\n" name pp_v v
    end
  in
  iter define x;
  fun ppf v -> match Names.find_opt names v with
  | None -> pp_v ppf v
  | Some name -> Format.pp_print_string ppf name
2626+2727+(* Normalization properties. *)
(* [pp_boundary nf ucd ppf nf_quick_check] prints on [ppf] the definition
   of [<nf>_boundary_map], a boolean trie map that is [true] on the code
   points whose quick check property is [`True] and whose canonical
   combining class is 0. Two maps are built, one per default value, and the
   smaller one is emitted. Progress and sizes are logged. *)
let pp_boundary nf ucd ppf nf_quick_check =
  Gen.log "%s boundary property as character boolean trie map" nf;
  let qc_label = strf "%s_quick_check" nf in
  let is_boundary u =
    match Gen.ucd_get ucd u nf_quick_check qc_label with
    | `True -> Gen.ucd_get ucd u Uucd.canonical_combining_class "ccc" = 0
    | `Maybe | `False -> false
  in
  let tm, fm = Gen.bool_prop_maps is_boundary in
  let tm_size = Uunf_tmapbool.size tm in
  let fm_size = Uunf_tmapbool.size fm in
  Gen.log ", asserting data.\n";
  Gen.assert_bool_prop_maps is_boundary tm fm;
  Gen.log " boolean trie map (default true) size: %s\n"
    (Gen.str_of_size tm_size);
  Gen.log " boolean trie map (default false) size: %s\n"
    (Gen.str_of_size fm_size);
  let m, m_default = if tm_size > fm_size then fm, false else tm, true in
  Gen.log " Using map with default %b.\n\n" m_default;
  let pp_blob = intern Uunf_tmapbool.iter_blobs Uunf_tmapbool.pp_v ppf m in
  pp ppf "@[<2>let %s_boundary_map =@ %a@]@\n@\n"
    nf (Uunf_tmapbool.dump_pp pp_blob) m
(* [pp_ccc ppf ucd] prints on [ppf] the definition of [ccc_map], a byte
   trie map with default 0 from code points to their canonical combining
   class. Progress and size are logged. *)
let pp_ccc ppf ucd =
  Gen.log "ccc property as character byte trie map";
  let ccc u = Gen.ucd_get ucd u Uucd.canonical_combining_class "ccc" in
  let m = Gen.byte_prop_map ~default:0 ccc in
  let words = Uunf_tmapbyte.size m in
  Gen.log ", asserting data.\n";
  Gen.assert_byte_prop_map ccc m;
  Gen.log " trie map size: %s\n\n" (Gen.str_of_size words);
  let pp_blob = intern Uunf_tmapbyte.iter_blobs Uunf_tmapbyte.pp_v ppf m in
  pp ppf "@[<2>let ccc_map =@ %a@]@\n@\n" (Uunf_tmapbyte.dump_pp pp_blob) m
(* [pp_decomp ppf ucd] prints on [ppf] the definition of [decomp_map], a
   trie map from code points to their decomposition mapping as an array of
   code points. Code points that map to themselves and Hangul syllables get
   the default [Uunf_tmap.nil]. For non canonical decompositions bit 24 of
   the first element of the array is set. Progress and size are logged. *)
let pp_decomp ppf ucd =
  Gen.log "decomposition mapping as trie map";
  let default = Uunf_tmap.nil in
  let prop u =
    match Gen.ucd_get ucd u Uucd.decomposition_mapping "decomposition mapping"
    with
    | `Self -> default
    | `Cps cps ->
        let canonical =
          Gen.ucd_get ucd u Uucd.decomposition_type "decomposition_type" = `Can
        in
        match Gen.is_hangul_syllabe u, canonical with
        | true, true -> default
        | true, false -> invalid_arg (strf "hangul not canon decomp %X" u)
        | false, true -> Array.of_list cps
        | false, false ->
            let d = Array.of_list cps in
            d.(0) <- (1 lsl 24) lor d.(0);
            d
  in
  let m = Gen.prop_map ~default prop in
  let size_v a = match Array.length a with 0 -> 0 | n -> 1 + n in
  let t_size = Uunf_tmap.size size_v m in
  let pp_cps ppf a =
    if Array.length a = 0 then pp ppf "nil" else begin
      pp ppf "[|@,";
      Array.iter (fun cp -> pp ppf "@,0x%X;@," cp) a;
      pp ppf "@,|]"
    end
  in
  Gen.log ", asserting data.\n";
  Gen.assert_prop_map prop m;
  Gen.log " trie map size: %s\n\n" (Gen.str_of_size t_size);
  pp ppf "@[<2>let decomp_map =@ %a@]@\n@\n" (Uunf_tmap.dump pp_cps) m
module Cpmap = Uucd.Cpmap

(* [pp_compose ppf ucd] prints on [ppf] the definition of [compose_map], a
   trie map from a first code point [cp1] to the compositions it starts.

   A code point [u] contributes a composition when its decomposition
   mapping is exactly two code points [cp1; cp2], its decomposition type is
   canonical, it is not a full composition exclusion and it is not a Hangul
   syllable. The value bound to [cp1] is a flat array
   [cp2; u; cp2'; u'; ...] of such pairs sorted with [compare]; code points
   starting no composition get [Uunf_tmap.nil]. Raises [Invalid_argument]
   if an eligible mapping does not have exactly two code points. Progress,
   size and the maximal number of compositions for one [cp1] are logged. *)
let pp_compose ppf ucd =
  Gen.log "composition to primary composites as trie map";
  let m = ref Cpmap.empty in
  let add_map cp1 cp2 c =
    let l = try Cpmap.find cp1 !m with Not_found -> [] in
    m := Cpmap.add cp1 ((cp2, c) :: l) !m
  in
  let add u =
    match Gen.ucd_get ucd u Uucd.decomposition_mapping "decomposition_mapping"
    with
    | `Self -> ()
    | `Cps cps ->
        (* The label only shows up in the error raised by [Gen.ucd_get]; it
           has to name the property that is looked up. *)
        let fce = "full_composition_exclusion" in
        if Gen.ucd_get ucd u Uucd.full_composition_exclusion fce then () else
        let t =
          Gen.ucd_get ucd u Uucd.decomposition_type "decomposition_type"
        in
        if t <> `Can then () else
        if Gen.is_hangul_syllabe u then () else
        match cps with
        | [cp1; cp2] -> add_map cp1 cp2 u
        | _ -> invalid_arg (strf "cannot handle composition for %X" u)
  in
  Gen.iter_uchar_ints add;
  let default = Uunf_tmap.nil in
  let max_comps = ref 0 in
  let prop u =
    try
      let comps = List.sort compare (Cpmap.find u !m) in
      let len = List.length comps in
      let a = Array.make (len * 2) 0 in
      let set i (cp2, c) = a.(2 * i) <- cp2; a.(2 * i + 1) <- c in
      List.iteri set comps;
      max_comps := max !max_comps len;
      a
    with Not_found -> Uunf_tmap.nil
  in
  let m = Gen.prop_map ~default prop in
  let size_v = function [||] -> 0 | a -> 1 + Array.length a in
  let t_size = Uunf_tmap.size size_v m in
  let pp_d ppf = function
  | [||] -> pp ppf "nil"
  | a ->
      pp ppf "[|@,";
      for i = 0 to Array.length a - 1 do pp ppf "@,0x%X;@," a.(i) done;
      pp ppf "@,|]"
  in
  Gen.log ", asserting data.\n"; Gen.assert_prop_map prop m;
  Gen.log " trie map size: %s\n" (Gen.str_of_size t_size);
  Gen.log " max num. of possible composition for a base char: %d\n\n"
    !max_comps;
  pp ppf "@[<2>let compose_map =@ %a@]@\n@\n" (Uunf_tmap.dump pp_d) m;
  ()
(* [pp_version ppf ucd] prints on [ppf] the definition of
   [unicode_version]. The version is the last token of the database
   description when it has one or two space separated tokens, and the whole
   description otherwise. *)
let pp_version ppf ucd =
  let description = ucd.Uucd.description in
  let version = match String.split_on_char ' ' description with
  | [tok] | [_; tok] -> tok
  | _ -> description
  in
  pp ppf "@[<2>let unicode_version = \"%s\"@]@\n@\n" version
(* [pp_norms ppf ucd] prints on [ppf] the contents of the data module: the
   Unicode version, the four boundary maps, the ccc map, the decomposition
   map and the composition map, each group preceded by the [open] it
   needs. *)
let pp_norms ppf ucd =
  let boundary nf quick_check = pp_boundary nf ucd ppf quick_check in
  pp_version ppf ucd;
  pp ppf "open Uunf_tmapbool;;@\n@\n";
  boundary "nfc" Uucd.nfc_quick_check;
  boundary "nfd" Uucd.nfd_quick_check;
  boundary "nfkc" Uucd.nfkc_quick_check;
  boundary "nfkd" Uucd.nfkd_quick_check;
  pp ppf "open Uunf_tmapbyte;;@\n@\n";
  pp_ccc ppf ucd;
  pp ppf "open Uunf_tmap;;@\n@\n";
  pp_decomp ppf ucd;
  pp_compose ppf ucd

(* [pp_mod ppf ucd] prints the whole generated module, header included. *)
let pp_mod ppf ucd = Gen.pp_mod pp_norms ppf ucd
+62
vendor/opam/uunf/support/generate_data.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2015 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+(* Extracts data from the Unicode Character Database *)
let str = Format.sprintf
let exec = Filename.basename Sys.executable_name

(* [ucd_or_die inf] decodes the Unicode character database found in file
   [inf], or on stdin if [inf] is "-". On a decoding error the error and
   its position are printed on stderr and the program exits with code 1;
   the same happens, without position, on [Sys_error]. The channel opened
   for [inf] is closed once decoding is over; stdin is left open. *)
let ucd_or_die inf = try
  let ic = if inf = "-" then stdin else open_in inf in
  let close () = if inf <> "-" then close_in_noerr ic in
  let d = Uucd.decoder (`Channel ic) in
  match Uucd.decode d with
  | `Ok db -> close (); db
  | `Error e ->
      let (l0, c0), (l1, c1) = Uucd.decoded_range d in
      close ();
      Printf.eprintf "%s:%d.%d-%d.%d: %s\n%!" inf l0 c0 l1 c1 e;
      exit 1
with Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1
(* [process inf outf] loads the database from [inf] and writes the module
   generated by [Gen_norm.pp_mod] to file [outf], or to stdout if [outf] is
   "-". Any [Sys_error] is printed on stderr and makes the program exit
   with code 1. *)
let process inf outf =
  Gen.log "Loading Unicode character database.\n";
  let ucd = ucd_or_die inf in
  Gen.log "Note: reported sizes do not take sharing into account.\n";
  let write oc =
    let ppf = Format.formatter_of_out_channel oc in
    Gen_norm.pp_mod ppf ucd;
    Format.pp_print_flush ppf ();
    close_out oc
  in
  match
    let oc = if outf = "-" then stdout else open_out outf in
    (try write oc with Sys_error _ as e -> close_out oc; raise e)
  with
  | () -> ()
  | exception Sys_error e -> Printf.eprintf "%s\n%!" e; exit 1
(* [main ()] parses the command line, an optional database file and the
   [-o] option, then runs [process] with the defaults filled in. *)
let main () =
  let usage = str
    "Usage: %s [OPTION]... [DBFILE]\n\
     \ Generates data modules from an Unicode character database XML file.\n\
     \ DBFILE defaults to support/ucd.xml\n\
     Options:" exec
  in
  let inf = ref None and outf = ref None in
  let set_inf f = match !inf with
  | None -> inf := Some f
  | Some _ ->
      raise
        (Arg.Bad "only one Unicode character database file can be specified")
  in
  let options = [
    "-o", Arg.String (fun s -> outf := Some s),
    "<FILE> output file, defaults to src/uunf_data.ml";
  ]
  in
  Arg.parse (Arg.align options) set_inf usage;
  let inf = Option.value !inf ~default:"support/ucd.xml" in
  let outf = Option.value !outf ~default:"src/uunf_data.ml" in
  process inf outf

let () = main ()
+19
vendor/opam/uunf/test/examples.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: CC0-1.0
44+ ---------------------------------------------------------------------------*)
(* [utf_8_normalize nf s] is the UTF-8 string [s] normalized to form [nf].
   Characters are decoded with the standard library UTF-8 decoder, fed to a
   normalizer and the normalizer output is re-encoded in a buffer. *)
let utf_8_normalize nf s =
  let buf = Buffer.create (String.length s * 3) in
  let normalizer = Uunf.create nf in
  (* Pull characters out of the normalizer until it asks for more input. *)
  let rec drain = function
  | `Uchar u -> Buffer.add_utf_8_uchar buf u; drain (Uunf.add normalizer `Await)
  | `Await | `End -> ()
  in
  let feed v = drain (Uunf.add normalizer v) in
  let len = String.length s in
  let i = ref 0 in
  while !i < len do
    let dec = String.get_utf_8_uchar s !i in
    feed (`Uchar (Uchar.utf_decode_uchar dec));
    i := !i + Uchar.utf_decode_length dec
  done;
  feed `End;
  Buffer.contents buf
+251
vendor/opam/uunf/test/test_uunf.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
55+66+open B0_testing
77+open B0_std
88+open Result.Syntax
99+1010+(* Uunf tests, including Unicode's Normalization Conformance tests *)
(* [uchar_dump ppf u] prints [u] on [ppf] as U+XXXX, with at least four
   uppercase hexadecimal digits. *)
let uchar_dump ppf u =
  let cp = Uchar.to_int u in
  Format.fprintf ppf "U+%04X" cp
1313+1414+(* Conformance data decoding and tests *)
(* A decoded conformance test line: the five columns as sequences of
   scalar values, and the comment that follows them on the line. *)
type conformance_test = Uchar.t list array * string (* columns + comment. *)
module Uset = Set.Make (Uchar) (* not a diet set, but will do here. *)
(* [uchar_of_string v] parses [v], a non-empty sequence of uppercase
   hexadecimal digits, as a Unicode scalar value. Raises [Failure] with an
   empty message if [v] is empty, has another character, or does not denote
   a scalar value (surrogate or greater than U+10FFFF). [Failure] is the
   exception the conformance data decoder handles to skip a line. *)
let uchar_of_string v = (* parses a scalar value. *)
  let digit c = match c with
  | '0' .. '9' -> Char.code c - 0x30
  | 'A' .. 'F' -> Char.code c - 0x41 + 10
  | _ -> failwith ""
  in
  let add cp c =
    let cp = cp * 16 + digit c in
    (* Bail out early so that long inputs cannot overflow. *)
    if cp > 0x10FFFF then failwith "" else cp
  in
  if v = "" then failwith "" else
  let cp = String.fold_left add 0 v in
  if Uchar.is_valid cp then Uchar.of_int cp else failwith ""

(* [uchars_of_string v] parses [v] as a sequence of scalar values
   separated by single spaces, see [uchar_of_string]. *)
let uchars_of_string v = List.map uchar_of_string (String.split_on_char ' ' v)
(* [decode_conformance_data file] logs the name of [file], reads it and
   parses it line by line. The result is [Ok (tests, decomps)] where
   [tests] are the decoded test lines in file order and [decomps] is the
   set of scalar values found alone in the first column of the test lines
   read while collection is enabled. Collection is switched on by an
   @Part1 header and off by an @Part2 header. A line whose parsing raises
   [Failure] is logged and skipped. The result of [Os.File.read] is bound
   with [let*], presumably the result bind of [Result.Syntax], so a read
   error would be returned as is. *)
3030+3131+let decode_conformance_data test_data_file =
3232+ Test.log "Reading test data from %a" (Fmt.code' Fpath.pp) test_data_file;
(* Splits [s] at [sep], dropping empty pieces. *)
3333+ let split_string sep s =
3434+ List.filter (fun s -> s <> "") (String.split_on_char sep s)
3535+ in
(* [tests] is accumulated in reverse order. *)
3636+ let rec loop tests collect_decomps decomps = function
3737+ | [] -> List.rev tests, decomps
3838+ | l :: ls ->
(* The part before the first # is the data, the part after it the comment. *)
3939+ try match split_string '#' l with
4040+ | "@Part1 " :: _ -> loop tests true decomps ls
4141+ | "@Part2 " :: _ -> loop tests false decomps ls
(* Any other header line, and lines without both data and comment, are
   skipped. *)
4242+ | p :: _ :: _ when p.[0] = '@' -> loop tests collect_decomps decomps ls
4343+ | [] | _ :: [] -> loop tests collect_decomps decomps ls
4444+ | test :: comment :: _ ->
4545+ begin match split_string ';' test with
4646+ | c1 :: c2 :: c3 :: c4 :: c5 :: _ ->
4747+ let test = [| uchars_of_string c1; uchars_of_string c2;
4848+ uchars_of_string c3; uchars_of_string c4;
4949+ uchars_of_string c5; |], comment
5050+ in
(* While collecting, the first column must be a single scalar value. *)
5151+ let decomps =
5252+ if not collect_decomps then decomps else
5353+ match (fst test).(0) with [ uchar ] ->
5454+ Uset.add uchar decomps
5555+ | _ -> failwith ""
5656+ in
5757+ loop (test :: tests) collect_decomps decomps ls
5858+ | _ -> failwith ""
5959+ end
(* The recursive calls above are made inside this handler. *)
6060+ with Failure _ ->
6161+ Test.log "Unable to parse line:\n`%s'\n" l;
6262+ loop tests collect_decomps decomps ls
6363+ in
6464+ let* s = Os.File.read test_data_file in
6565+ Ok (loop [] false Uset.empty (String.split_on_char '\n' s))
(* Test argument carrying the decoded conformance data; its value is
   provided in [main]. *)
6666+6767+6868+let conformance = Test.Arg.make ()
(* For every conformance test line with columns c1 to c5, held in
   [cs.(0)] to [cs.(4)], normalizes each column to the four normal forms
   and checks these invariants:
   - NFC: c2 is the NFC of c1, c2 and c3; c4 is the NFC of c4 and c5.
   - NFD: c3 is the NFD of c1, c2 and c3; c5 is the NFD of c4 and c5.
   - NFKC: c4 is the NFKC of c1, c2, c3, c4 and c5.
   - NFKD: c5 is the NFKD of c1, c2, c3, c4 and c5.
   A failure is reported with the comment of the offending line. *)
6969+7070+let test_conformance_normalizations =
7171+ Test.test' conformance "conformance normalization invariants" @@
7272+ fun (tests, _) ->
(* One normalizer and one result slot per column, for each normal form. *)
7373+ let nc, nfc = Array.init 5 (fun _ -> Uunf.create `NFC), Array.make 5 [] in
7474+ let nd, nfd = Array.init 5 (fun _ -> Uunf.create `NFD), Array.make 5 [] in
7575+ let nkc, nfkc = Array.init 5 (fun _ -> Uunf.create `NFKC), Array.make 5 [] in
7676+ let nkd, nfkd = Array.init 5 (fun _ -> Uunf.create `NFKD), Array.make 5 [] in
(* Feeds [v] to [n] and pushes everything the normalizer outputs on
   [acc]; results are therefore accumulated in reverse order. *)
7777+ let rec add n acc v = match Uunf.add n v with
7878+ | `Uchar u -> add n (u :: acc) `Await
7979+ | `Await | `End -> acc
8080+ in
(* Feeds [v] to the four normalizers of column [i]. *)
8181+ let parallel_add i v =
8282+ nfc.(i) <- add nc.(i) nfc.(i) v;
8383+ nfd.(i) <- add nd.(i) nfd.(i) v;
8484+ nfkc.(i) <- add nkc.(i) nfkc.(i) v;
8585+ nfkd.(i) <- add nkd.(i) nfkd.(i) v
8686+ in
8787+ let test (cs, comment) =
(* Normalizers are reused across lines and reset before each column. *)
8888+ for i = 0 to 4 do
8989+ Uunf.reset nc.(i); nfc.(i) <- [];
9090+ Uunf.reset nd.(i); nfd.(i) <- [];
9191+ Uunf.reset nkc.(i); nfkc.(i) <- [];
9292+ Uunf.reset nkd.(i); nfkd.(i) <- [];
9393+ List.iter (fun u -> parallel_add i (`Uchar u)) cs.(i);
9494+ parallel_add i `End;
(* Put the accumulated results back in stream order. *)
9595+ nfc.(i) <- List.rev nfc.(i);
9696+ nfd.(i) <- List.rev nfd.(i);
9797+ nfkc.(i) <- List.rev nfkc.(i);
9898+ nfkd.(i) <- List.rev nfkd.(i);
9999+ done;
100100+ if cs.(1) <> nfc.(0) then Test.fail "NFC: c2 <> toNFC(c1) for%s" comment;
101101+ if cs.(1) <> nfc.(1) then Test.fail "NFC: c2 <> toNFC(c2) for%s" comment;
102102+ if cs.(1) <> nfc.(2) then Test.fail "NFC: c2 <> toNFC(c3) for%s" comment;
103103+ if cs.(3) <> nfc.(3) then Test.fail "NFC: c4 <> toNFC(c4) for%s" comment;
104104+ if cs.(3) <> nfc.(4) then Test.fail "NFC: c4 <> toNFC(c5) for%s" comment;
105105+ if cs.(2) <> nfd.(0) then Test.fail "NFD: c3 <> toNFD(c1) for%s" comment;
106106+ if cs.(2) <> nfd.(1) then Test.fail "NFD: c3 <> toNFD(c2) for%s" comment;
107107+ if cs.(2) <> nfd.(2) then Test.fail "NFD: c3 <> toNFD(c3) for%s" comment;
108108+ if cs.(4) <> nfd.(3) then Test.fail "NFD: c5 <> toNFD(c4) for%s" comment;
109109+ if cs.(4) <> nfd.(4) then Test.fail "NFD: c5 <> toNFD(c5) for%s" comment;
110110+ if cs.(3) <> nfkc.(0) then Test.fail "NFKC: c4 <> toNFKC(c1) for%s" comment;
111111+ if cs.(3) <> nfkc.(1) then Test.fail "NFKC: c4 <> toNFKC(c2) for%s" comment;
112112+ if cs.(3) <> nfkc.(2) then Test.fail "NFKC: c4 <> toNFKC(c3) for%s" comment;
113113+ if cs.(3) <> nfkc.(3) then Test.fail "NFKC: c4 <> toNFKC(c4) for%s" comment;
114114+ if cs.(3) <> nfkc.(4) then Test.fail "NFKC: c4 <> toNFKC(c5) for%s" comment;
115115+ if cs.(4) <> nfkd.(0) then Test.fail "NFKD: c5 <> toNFKD(c1) for%s" comment;
116116+ if cs.(4) <> nfkd.(1) then Test.fail "NFKD: c5 <> toNFKD(c2) for%s" comment;
117117+ if cs.(4) <> nfkd.(2) then Test.fail "NFKD: c5 <> toNFKD(c3) for%s" comment;
118118+ if cs.(4) <> nfkd.(3) then Test.fail "NFKD: c5 <> toNFKD(c4) for%s" comment;
119119+ if cs.(4) <> nfkd.(4) then Test.fail "NFKD: c5 <> toNFKD(c5) for%s" comment;
120120+ in
121121+ List.iter test tests
(* Checks that every scalar value that is not in the set collected from
   the conformance data normalizes to itself in the four normal forms. *)
let test_conformance_non_decomposables =
  Test.test' conformance "conformance of non-decomposable characters" @@
  fun (_, decomps) ->
  let nc = Uunf.create `NFC in
  let nd = Uunf.create `NFD in
  let nkc = Uunf.create `NFKC in
  let nkd = Uunf.create `NFKD in
  (* [norm n u] is the normalization of the single character [u] by [n]. *)
  let norm n u =
    let rec drain acc = function
    | `Uchar u -> drain (u :: acc) (Uunf.add n `Await)
    | `Await | `End -> acc
    in
    let fed = drain [] (Uunf.add n (`Uchar u)) in
    List.rev (drain fed (Uunf.add n `End))
  in
  let check u =
    if not (Uset.mem u decomps) then begin
      let expected = [u] in
      List.iter Uunf.reset [nc; nd; nkc; nkd];
      if norm nc u <> expected then
        Test.fail "NFC: %a <> toNFC(%a)" uchar_dump u uchar_dump u;
      if norm nd u <> expected then
        Test.fail "NFD: %a <> toNFD(%a)" uchar_dump u uchar_dump u;
      if norm nkc u <> expected then
        Test.fail "NFKC: %a <> toNFKC(%a)" uchar_dump u uchar_dump u;
      if norm nkd u <> expected then
        Test.fail "NFKD: %a <> toNFKD(%a)" uchar_dump u uchar_dump u
    end
  in
  (* For each unicode scalar value *)
  let u = ref Uchar.min in
  while not (Uchar.equal Uchar.max !u) do
    check !u;
    u := Uchar.succ !u
  done;
  check Uchar.max
(* Checks [Uunf.ccc] on U+0020 (class 0) and U+0301 (class 230). *)
let test_ccc =
  Test.test "Uunf.ccc" @@ fun () ->
  let ccc cp = Uunf.ccc (Uchar.of_int cp) in
  Test.int (ccc 0x0020) 0 ~__POS__;
  Test.int (ccc 0x0301) 230 ~__POS__
(* [various_norm_tests ?__POS__ test] runs, in a test block, [test src nf
   dst] on a fixed table of cases: [src] is a sequence of code points, [nf]
   a normal form and [dst] the expected normalization of [src]. *)
let various_norm_tests ?__POS__ test =
  Test.block ?__POS__ @@ fun () ->
  List.iter (fun (src, nf, dst) -> test src nf dst) [
    [0x1E69], `NFD, [0x0073; 0x0323; 0x0307];
    [0x1E69], `NFC, [0x1E69];
    [0x1E0B; 0x0323], `NFD, [0x0064; 0x0323; 0x0307];
    [0x1E0B; 0x0323], `NFC, [0x1E0D; 0x0307];
    [0xFB01], `NFD, [0xFB01];
    [0xFB01], `NFC, [0xFB01];
    [0xFB01], `NFKD, [0x0066; 0x0069];
    [0xFB01], `NFKC, [0x0066; 0x0069];
    [0x0032; 0x2075], `NFD, [0x0032; 0x2075];
    [0x0032; 0x2075], `NFC, [0x0032; 0x2075];
    [0x0032; 0x2075], `NFKD, [0x0032; 0x0035];
    [0x0032; 0x2075], `NFKC, [0x0032; 0x0035];
    [0x1E9B; 0x0323], `NFD, [0x017F; 0x0323; 0x307];
    [0x1E9B; 0x0323], `NFC, [0x1E9B; 0x0323];
    [0x1E9B; 0x0323], `NFKD, [0x0073; 0x0323; 0x0307];
    [0x1E9B; 0x0323], `NFKC, [0x1E69];
    [0x0041; 0x007A; 0x0335; 0x0327; 0x0324; 0x0301; 0x0041], `NFC,
    [0x0041; 0x017A; 0x0335; 0x0327; 0x0324; 0x0041];
    (* found by crowbar *)
    [0x01C6; 0x032D], `NFKC, [0x0064; 0x017E; 0x032D];
    [0xFF80; 0x1FD3; 0xFF9E; 0x1FD3], `NFKC, [0x30BF; 0x0390; 0x3099; 0x0390];
    (* found again by crowbar *)
    [0xC100; 0x20D2; 0x11C1; 0x11C1], `NFC, [0xC100; 0x20D2; 0x11C1; 0x11C1];
  ]
(* Runs the cases of [various_norm_tests] directly on a [Uunf]
   normalizer. *)
let test_specific =
  Test.test "specific normalizations" @@ fun () ->
  let test src nf dst =
    let n = Uunf.create nf in
    (* Pushes on [acc] everything the normalizer outputs. *)
    let rec drain acc = function
    | `Uchar u -> drain (u :: acc) (Uunf.add n `Await)
    | `Await | `End -> acc
    in
    let feed acc v = drain acc (Uunf.add n v) in
    let feed_cp acc cp = feed acc (`Uchar (Uchar.of_int cp)) in
    let got = List.rev (feed (List.fold_left feed_cp [] src) `End) in
    let expected = List.map Uchar.of_int dst in
    if got <> expected then Test.fail ""
  in
  various_norm_tests test ~__POS__
(* Runs the cases of [various_norm_tests] on the [Uunf_string] functions,
   once per UTF encoding scheme. *)
let test_uunf_string =
  Test.test "Uunf_string" @@ fun () ->
  let test enc normalize =
    (* [encode cps] is the code points [cps] encoded with [enc]. *)
    let encode cps =
      let b = Buffer.create 42 in
      List.iter (fun cp -> enc b (Uchar.of_int cp)) cps;
      Buffer.contents b
    in
    let test src nf dst = assert (normalize nf (encode src) = encode dst) in
    various_norm_tests test ~__POS__
  in
  test Buffer.add_utf_8_uchar Uunf_string.normalize_utf_8;
  test Buffer.add_utf_16be_uchar Uunf_string.normalize_utf_16be;
  test Buffer.add_utf_16le_uchar Uunf_string.normalize_utf_16le
(* Checks the exact sequence of results of an NFKC normalizer fed with
   U+2105 and then flushed with [`End]. *)
let test_flushing_end_seq =
  Test.test "flushing end of stream" @@ fun () ->
  let n = Uunf.create `NFKC in
  let uchar u = `Uchar (Uchar.of_int u) in
  let expect input output =
    if Uunf.add n input <> output then Test.fail ""
  in
  expect (uchar 0x2105) `Await;
  expect `Await `Await;
  expect `End (uchar 0x0063);
  expect `Await (uchar 0x002F);
  expect `Await (uchar 0x006F);
  expect `Await `End
(* [main ()] runs the tests. The conformance test file is the first
   positional argument and defaults to test/NormalizationTest.txt; the data
   decoded from it is given to the tests through [conformance]. *)
let main () =
  let test_data_file =
    let doc = "Unicode normalization conformance test file." in
    let default = Fpath.v "test/NormalizationTest.txt" in
    let info = Cmdliner.Arg.info [] ~doc in
    Cmdliner.Arg.value (Cmdliner.Arg.pos 0 B0_std_cli.filepath default info)
  in
  Test.main' test_data_file @@ fun test_data_file ->
  match decode_conformance_data test_data_file with
  | Ok data ->
      let args = Test.Arg.[value conformance data] in
      Test.autorun ~args ()
  | Error e -> Test.failstop "%s" e

let () = if not !Sys.interactive then exit (main ())
+170
vendor/opam/uunf/test/unftrip.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2012 The uunf programmers. All rights reserved.
33+ SPDX-License-Identifier: ISC
44+ ---------------------------------------------------------------------------*)
let strf = Printf.sprintf
let pp = Format.fprintf

(* [pp_pos ppf d] prints the position of decoder [d] as
   line.column:(count,byte count), the byte count in hexadecimal. *)
let pp_pos ppf d =
  let line = Uutf.decoder_line d and col = Uutf.decoder_col d in
  let count = Uutf.decoder_count d in
  let byte_count = Uutf.decoder_byte_count d in
  pp ppf "%d.%d:(%d,%06X) " line col count byte_count
(* [pp_malformed ppf bs] prints the bytes of [bs] in hexadecimal, separated
   by break hints, after the words malformed bytes. *)
let pp_malformed ppf bs =
  let pp_byte i c =
    if i = 0 then pp ppf "%02X" (Char.code c)
    else pp ppf "@ %02X" (Char.code c)
  in
  pp ppf "@[malformed bytes @[(";
  String.iteri pp_byte bs;
  pp ppf ")@]@]"
(* [pp_dump_uchar ppf u] prints [u] on [ppf] as U+XXXX, with at least four
   uppercase hexadecimal digits. *)
let pp_dump_uchar ppf u =
  let cp = Uchar.to_int u in
  Format.fprintf ppf "U+%04X" cp

(* Name of the executable, used as the prefix of log messages. *)
let exec = Filename.basename Sys.executable_name

(* [log fmt ...] prints a message prefixed by [exec] on stderr and
   flushes. *)
let log fmt = Format.fprintf Format.err_formatter ("%s: " ^^ fmt ^^ "@?") exec
(* Set to [true] as soon as malformed input is reported. *)
let input_malformed = ref false

(* [log_malformed inf d bs] records that malformed input was seen and logs
   the malformed bytes [bs] with the file name [inf] and the position of
   decoder [d]. *)
let log_malformed inf d bs =
  let () = input_malformed := true in
  log "%s:%a: %a@." inf pp_pos d pp_malformed bs
2828+2929+(* Output *)
3030+3131+let uchar_dump ppf = function
3232+| `End -> () | `Uchar u -> pp ppf "%a@\n" pp_dump_uchar u
3333+3434+let uchar_encoder enc =
3535+ let enc = match enc with `ISO_8859_1 | `US_ASCII -> `UTF_8
3636+ | #Uutf.encoding as enc -> enc
3737+ in
3838+ let e = Uutf.encoder enc (`Channel stdout) in
3939+ fun v -> ignore (Uutf.encode e v)
(* [out_fun ascii oe] selects the output function: the U+XXXX dump on the
   standard formatter when [ascii] is true, an encoder for [oe] otherwise. *)
let out_fun ascii oe = match ascii with
| true -> uchar_dump Format.std_formatter
| false -> uchar_encoder oe
4343+4444+(* Trip *)
(* The replacement character, output in place of malformed input. *)
let u_rep = `Uchar Uutf.u_rep

(* [id inf d first_dec out] copies the characters decoded by [d] to [out]
   without normalizing them, starting with the already decoded
   [first_dec]. Malformed sequences are logged and replaced by [u_rep].
   If the decoder removed a byte order mark, one is output first. *)
let id inf d first_dec out =
  let rec go dec = match dec with
  | `Await -> assert false
  | `End as v -> out v
  | `Uchar _ as v -> out v; next ()
  | `Malformed bytes -> log_malformed inf d bytes; out u_rep; next ()
  and next () = go (Uutf.decode d)
  in
  if Uutf.decoder_removed_bom d then out (`Uchar Uutf.u_bom);
  go first_dec
(* [normalize nf inf d first_dec out] normalizes the characters decoded by
   [d] to the form [nf] and writes the result with [out], starting with
   the already decoded [first_dec]. Malformed sequences are logged and
   replaced by [u_rep] before normalization. If the decoder removed a byte
   order mark, one is added to the normalizer first. *)
let normalize nf inf d first_dec out =
  let n = Uunf.create nf in
  (* Feed [v] to the normalizer and drain it: every character it returns
     is output and the normalizer is asked again with [`Await] until it
     answers [`Await] or [`End]. *)
  let rec add v = match Uunf.add n v with
  | `Uchar _ as u -> out u; add `Await
  | `Await | `End -> ()
  in
  let rec loop d = function
  | `Uchar _ as v -> add v; loop d (Uutf.decode d)
  | `End as v -> add v; out `End
  | `Malformed bs -> log_malformed inf d bs; add u_rep; loop d (Uutf.decode d)
  | `Await -> assert false
  in
  if Uutf.decoder_removed_bom d then add (`Uchar Uutf.u_bom);
  loop d first_dec
(* [trip nf inf enc ascii] reads [inf] ("-" means stdin), decodes it with
   encoding [enc] (guessed when [None]), optionally normalizes it to [nf]
   and writes the result on stdout, either encoded or as an ASCII dump
   when [ascii] is true. A [Sys_error] is logged and terminates the
   program with exit code 1. *)
let trip nf inf enc ascii =
  try
    (* The decoder must see the input bytes untouched: open files in binary
       mode and switch stdin to binary mode so that platforms that
       distinguish text from binary do not translate any byte. *)
    let ic =
      if inf = "-" then (set_binary_mode_in stdin true; stdin)
      else open_in_bin inf
    in
    (* The encoded output is byte data as well. The ASCII dump is left in
       the channel's default mode. *)
    if not ascii then set_binary_mode_out stdout true;
    let d = Uutf.decoder ?encoding:enc (`Channel ic) in
    let first_dec = Uutf.decode d in (* guess encoding if needed. *)
    let out = out_fun ascii (Uutf.decoder_encoding d) in
    begin match nf with
    | None -> id inf d first_dec out
    | Some nf -> normalize nf inf d first_dec out
    end;
    if inf <> "-" then close_in ic;
    flush stdout
  with Sys_error e -> log "%s@." e; exit 1
8585+8686+(* Version *)
(* Print the Unicode version reported by [Uunf.unicode_version] on stdout,
   followed by a newline and a flush. *)
let unicode_version () =
  let version = Uunf.unicode_version in
  Format.printf "%s@." version
8989+9090+(* Cmd *)
(* [do_cmd cmd nf inf enc ascii] dispatches on the selected action: print
   the Unicode version, or process the input with [trip]. *)
let do_cmd cmd nf inf enc ascii =
  match cmd with
  | `Trip -> trip nf inf enc ascii
  | `Unicode_version -> unicode_version ()
9595+9696+(* Cmdline interface *)
9797+9898+open Cmdliner
(* Action selection: [`Trip] by default, [`Unicode_version] when the
   --unicode-version flag is given. *)
let cmd =
  let version_flag =
    let doc = "Output supported Unicode version." in
    (`Unicode_version, Arg.info ~doc ["unicode-version"])
  in
  Arg.value (Arg.vflag `Trip [version_flag])
(* Title of the manual section listing the normalization form flags. *)
let nf_doc = "NORMALIZATION"

(* Normalization form selection: [None] by default, [Some form] when one
   of --nfd, --nfc, --nfkd or --nfkc is given. *)
let nf =
  (* Build one flag entry documented in the [nf_doc] section. *)
  let flag form name doc = Some form, Arg.info [name] ~doc ~docs:nf_doc in
  let forms = [
    flag `NFD "nfd" "Normalization Form D (NFD), canonical decomposition.";
    flag `NFC "nfc"
      "Normalization Form C (NFC), canonical decomposition followed by \
       canonical composition.";
    flag `NFKD "nfkd"
      "Normalization form KD (NFKD), compatibility decomposition.";
    flag `NFKC "nfkc"
      "Normalization form KC (NFKC), compatibility decomposition \
       followed by canonical composition."; ]
  in
  Arg.value (Arg.vflag None forms)
(* Optional first positional argument naming the input file; defaults
   to "-". *)
let file =
  let info =
    Arg.info [] ~docv:"FILE"
      ~doc:"The input file. Reads from stdin if unspecified."
  in
  Arg.value (Arg.pos 0 Arg.string "-" info)
(* Optional -e/--encoding option selecting the input encoding among the
   listed names; [None] when the option is absent. *)
let enc =
  let encodings =
    [ "UTF-8", `UTF_8; "UTF-16", `UTF_16; "UTF-16LE", `UTF_16LE;
      "UTF-16BE", `UTF_16BE; "ASCII", `US_ASCII; "latin1", `ISO_8859_1 ]
  in
  let doc =
    strf "Input encoding, must %s. If unspecified the encoding is \
          guessed. The output encoding is the same as the input \
          encoding except for ASCII and latin1 where UTF-8 is output."
      (Arg.doc_alts_enum encodings)
  in
  let conv = Arg.some (Arg.enum encodings) in
  Arg.value (Arg.opt conv None (Arg.info [ "e"; "encoding" ] ~doc))
(* -a/--ascii flag requesting the U+XXXX dump instead of encoded text. *)
let ascii =
  let info =
    Arg.info ["a"; "ascii"]
      ~doc:"Output the input text as newline (U+000A) separated Unicode
            scalar values written in the US-ASCII charset."
  in
  Arg.value (Arg.flag info)
(* The unftrip command: its manual and the term applying [do_cmd] to the
   command line arguments. The [cmd] used inside the term is the action
   selection flag defined earlier, which this definition shadows. *)
let cmd =
  let doc = "normalize Unicode text" in
  let man = [
    `S "DESCRIPTION";
    (* Input comes from the FILE positional argument, stdin being only the
       default. *)
    `P "$(tname) inputs Unicode text from $(i,FILE), or from stdin if no
        file is specified, and rewrites it to stdout according to a
        specified Unicode normalization form (see UAX 15).";
    `P "If no normalization form is specified the character stream is left
        intact.";
    `P "Invalid byte sequences in the input are reported on stderr and
        replaced by the Unicode replacement character (U+FFFD) in the output.";
    `S nf_doc;
    `S "OPTIONS";
    `S "EXIT STATUS";
    `P "$(tname) exits with one of the following values:";
    `I ("0", "no error occurred");
    (* Status 1 is also what the program exits with on a system error
       while reading the input. *)
    `I ("1", "a command line parsing error or an input/output error \
              occurred");
    `I ("2", "the input text was malformed");
    `S "BUGS";
    `P "This program is distributed with the Uunf OCaml library.
        See http://erratique.ch/software/uunf for contact information." ]
  in
  Cmd.v (Cmd.info "unftrip" ~version:"%%VERSION%%" ~doc ~man)
    Term.(const do_cmd $ cmd $ nf $ file $ enc $ ascii)
(* Evaluate the command line and exit: a successful evaluation exits with
   2 if malformed input was seen and 0 otherwise, a command line error
   exits with 1, any other status is passed through unchanged. *)
let main () =
  let status = Cmd.eval cmd in
  if status = 0 then exit (if !input_malformed then 2 else 0)
  else if status = Cmd.Exit.cli_error then exit 1
  else exit status

(* Run the program unless the file is loaded in the interactive toplevel. *)
let () = if not !Sys.interactive then main ()
+49
vendor/opam/uunf/uunf.opam
···11+opam-version: "2.0"
22+name: "uunf"
33+synopsis: "Unicode text normalization for OCaml"
44+description: """\
55+Uunf is an OCaml library for normalizing Unicode text. It supports all
66+Unicode [normalization forms]. The library is independent from any IO
77+mechanism or Unicode text data structure and it can process text
88+without a complete in-memory representation.
99+1010+Uunf is distributed under the ISC license. It has no dependency.
1111+1212+[normalization forms]: http://www.unicode.org/reports/tr15/
1313+1414+Homepage: <http://erratique.ch/software/uunf>"""
1515+maintainer: "Daniel Bünzli <daniel.buenzl i@erratique.ch>"
1616+authors: "The uunf programmers"
1717+license: "ISC"
1818+tags: ["unicode" "text" "normalization" "org:erratique"]
1919+homepage: "https://erratique.ch/software/uunf"
2020+doc: "https://erratique.ch/software/uunf/doc/Uunf"
2121+bug-reports: "https://github.com/dbuenzli/uunf/issues"
2222+depends: [
2323+ "ocaml" {>= "4.14.0"}
2424+ "ocamlfind" {build}
2525+ "ocamlbuild" {build}
2626+ "topkg" {build & >= "1.1.0"}
2727+ "uucd" {dev & >= "17.0.0" & < "18.0.0"}
2828+]
2929+depopts: ["uutf" "cmdliner"]
3030+conflicts: [
3131+ "uutf" {< "1.0.0"}
3232+ "cmdliner" {< "1.1.0"}
3333+]
3434+build: [
3535+ "ocaml"
3636+ "pkg/pkg.ml"
3737+ "build"
3838+ "--dev-pkg"
3939+ "%{dev}%"
4040+ "--with-uutf"
4141+ "%{uutf:installed}%"
4242+ "--with-cmdliner"
4343+ "%{cmdliner:installed}%"
4444+]
4545+post-messages:
4646+ "If the build fails with \"ocamlopt.opt got signal and exited\", issue 'ulimit -s unlimited' and retry."
4747+ {failure & (arch = "ppc64" | arch = "arm64")}
4848+dev-repo: "git+https://erratique.ch/repos/uunf.git"
4949+x-maintenance-intent: ["(latest)"]