OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* HTML5 numeric character reference decoding *)
2
(* Code point substitutions applied to numeric character references, as
   described by the HTML5 spec's "numeric character reference end state".
   Each entry is (referenced code point, substituted code point).

   Entries are sorted by ascending referenced code point. find_replacement
   stops scanning at the first key greater than its argument, so new
   entries must keep that order. *)
let numeric_replacements =
  [|
    (0x00, 0xFFFD); (* U+0000 NULL becomes REPLACEMENT CHARACTER *)
    (0x80, 0x20AC); (* U+0080 becomes EURO SIGN *)
    (0x82, 0x201A); (* U+0082 becomes SINGLE LOW-9 QUOTATION MARK *)
    (0x83, 0x0192); (* U+0083 becomes LATIN SMALL LETTER F WITH HOOK *)
    (0x84, 0x201E); (* U+0084 becomes DOUBLE LOW-9 QUOTATION MARK *)
    (0x85, 0x2026); (* U+0085 becomes HORIZONTAL ELLIPSIS *)
    (0x86, 0x2020); (* U+0086 becomes DAGGER *)
    (0x87, 0x2021); (* U+0087 becomes DOUBLE DAGGER *)
    (0x88, 0x02C6); (* U+0088 becomes MODIFIER LETTER CIRCUMFLEX ACCENT *)
    (0x89, 0x2030); (* U+0089 becomes PER MILLE SIGN *)
    (0x8A, 0x0160); (* U+008A becomes LATIN CAPITAL LETTER S WITH CARON *)
    (0x8B, 0x2039); (* U+008B becomes SINGLE LEFT-POINTING ANGLE QUOTATION MARK *)
    (0x8C, 0x0152); (* U+008C becomes LATIN CAPITAL LIGATURE OE *)
    (0x8E, 0x017D); (* U+008E becomes LATIN CAPITAL LETTER Z WITH CARON *)
    (0x91, 0x2018); (* U+0091 becomes LEFT SINGLE QUOTATION MARK *)
    (0x92, 0x2019); (* U+0092 becomes RIGHT SINGLE QUOTATION MARK *)
    (0x93, 0x201C); (* U+0093 becomes LEFT DOUBLE QUOTATION MARK *)
    (0x94, 0x201D); (* U+0094 becomes RIGHT DOUBLE QUOTATION MARK *)
    (0x95, 0x2022); (* U+0095 becomes BULLET *)
    (0x96, 0x2013); (* U+0096 becomes EN DASH *)
    (0x97, 0x2014); (* U+0097 becomes EM DASH *)
    (0x98, 0x02DC); (* U+0098 becomes SMALL TILDE *)
    (0x99, 0x2122); (* U+0099 becomes TRADE MARK SIGN *)
    (0x9A, 0x0161); (* U+009A becomes LATIN SMALL LETTER S WITH CARON *)
    (0x9B, 0x203A); (* U+009B becomes SINGLE RIGHT-POINTING ANGLE QUOTATION MARK *)
    (0x9C, 0x0153); (* U+009C becomes LATIN SMALL LIGATURE OE *)
    (0x9E, 0x017E); (* U+009E becomes LATIN SMALL LETTER Z WITH CARON *)
    (0x9F, 0x0178); (* U+009F becomes LATIN CAPITAL LETTER Y WITH DIAERESIS *)
  |]
34
(* Look up [cp] in numeric_replacements.
   Returns [Some substitute] when the table has an entry keyed by [cp],
   and [None] otherwise. The scan walks the table front to back and gives
   up as soon as it meets a key larger than [cp]. *)
let find_replacement cp =
  let table_size = Array.length numeric_replacements in
  let rec scan idx =
    if idx >= table_size then None
    else
      let key, substitute = numeric_replacements.(idx) in
      if key < cp then scan (idx + 1)
      else if key = cp then Some substitute
      else None (* key > cp: no later entry can match in a sorted table *)
  in
  scan 0
45
(* Return the UTF-8 encoding of the code point [cp] as a string, using
   uutf's buffer encoder. [cp] is converted with [Uchar.of_int], which
   raises [Invalid_argument] when [cp] is not a Unicode scalar value, so
   callers must filter out-of-range and surrogate values first. *)
let codepoint_to_utf8 cp =
  (* A UTF-8 sequence is at most 4 bytes long. *)
  let encoded = Buffer.create 4 in
  let () = Uutf.Buffer.add_utf_8 encoded (Uchar.of_int cp) in
  Buffer.contents encoded
51
let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)

(* Decode the digits of a numeric character reference into UTF-8.

   [text] is the digit run only (no "&#", "x" or ";"); [is_hex] selects
   base 16 instead of base 10.

   Returns:
   - [None] when [text] is empty or contains a character that is not a
     digit of the selected base (signs, '_' separators and radix prefixes
     are rejected);
   - [Some replacement_char] when the value is 0, a surrogate, or greater
     than 0x10FFFF, including values too large to fit in an [int];
   - [Some s] otherwise, where [s] is the UTF-8 encoding of the code point
     after applying the HTML5 replacement table.

   The digits are parsed by hand rather than with [int_of_string_opt]:
   that function wraps large hexadecimal literals to negative integers
   (which then made [Uchar.of_int] raise) and returns [None] for huge
   decimal input that the spec maps to U+FFFD. *)
let decode text ~is_hex =
  let base = if is_hex then 16 else 10 in
  let digit_value c =
    match c with
    | '0' .. '9' -> Some (Char.code c - Char.code '0')
    | 'a' .. 'f' when is_hex -> Some (Char.code c - Char.code 'a' + 10)
    | 'A' .. 'F' when is_hex -> Some (Char.code c - Char.code 'A' + 10)
    | _ -> None
  in
  (* First value past the Unicode range. The accumulator is pinned here
     once reached, so arbitrarily long digit runs can never overflow. *)
  let saturated = 0x110000 in
  let len = String.length text in
  let rec parse i acc =
    if i >= len then Some acc
    else
      match digit_value text.[i] with
      | None -> None
      | Some d ->
        let acc = if acc >= saturated then saturated else (acc * base) + d in
        parse (i + 1) acc
  in
  if len = 0 then None
  else
    match parse 0 0 with
    | None -> None
    | Some cp ->
      (* Apply HTML5 replacements *)
      let cp =
        match find_replacement cp with
        | Some replacement -> replacement
        | None -> cp
      in
      (* Invalid per HTML5 spec: NULL, surrogates, and anything above
         0x10FFFF. [Uchar.is_valid] covers the last two. *)
      if cp = 0 || not (Uchar.is_valid cp) then Some replacement_char
      else Some (codepoint_to_utf8 cp)