OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 72 lines 2.7 kB view raw
(* HTML5 numeric character reference decoding *)

(* Replacement table applied to the numeric value of a character reference
   (HTML5 tokenizer, "numeric character reference end state"): NULL maps to
   U+FFFD and the listed C1 code points map to their Windows-1252
   counterparts.

   Entries are sorted by key in ascending order; [find_replacement] relies on
   that ordering to stop early. *)
let numeric_replacements = [|
  (0x00, 0xFFFD); (* NULL -> REPLACEMENT CHARACTER *)
  (0x80, 0x20AC); (* -> EURO SIGN *)
  (0x82, 0x201A); (* -> SINGLE LOW-9 QUOTATION MARK *)
  (0x83, 0x0192); (* -> LATIN SMALL LETTER F WITH HOOK *)
  (0x84, 0x201E); (* -> DOUBLE LOW-9 QUOTATION MARK *)
  (0x85, 0x2026); (* -> HORIZONTAL ELLIPSIS *)
  (0x86, 0x2020); (* -> DAGGER *)
  (0x87, 0x2021); (* -> DOUBLE DAGGER *)
  (0x88, 0x02C6); (* -> MODIFIER LETTER CIRCUMFLEX ACCENT *)
  (0x89, 0x2030); (* -> PER MILLE SIGN *)
  (0x8A, 0x0160); (* -> LATIN CAPITAL LETTER S WITH CARON *)
  (0x8B, 0x2039); (* -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK *)
  (0x8C, 0x0152); (* -> LATIN CAPITAL LIGATURE OE *)
  (0x8E, 0x017D); (* -> LATIN CAPITAL LETTER Z WITH CARON *)
  (0x91, 0x2018); (* -> LEFT SINGLE QUOTATION MARK *)
  (0x92, 0x2019); (* -> RIGHT SINGLE QUOTATION MARK *)
  (0x93, 0x201C); (* -> LEFT DOUBLE QUOTATION MARK *)
  (0x94, 0x201D); (* -> RIGHT DOUBLE QUOTATION MARK *)
  (0x95, 0x2022); (* -> BULLET *)
  (0x96, 0x2013); (* -> EN DASH *)
  (0x97, 0x2014); (* -> EM DASH *)
  (0x98, 0x02DC); (* -> SMALL TILDE *)
  (0x99, 0x2122); (* -> TRADE MARK SIGN *)
  (0x9A, 0x0161); (* -> LATIN SMALL LETTER S WITH CARON *)
  (0x9B, 0x203A); (* -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK *)
  (0x9C, 0x0153); (* -> LATIN SMALL LIGATURE OE *)
  (0x9E, 0x017E); (* -> LATIN SMALL LETTER Z WITH CARON *)
  (0x9F, 0x0178); (* -> LATIN CAPITAL LETTER Y WITH DIAERESIS *)
|]

(* [find_replacement cp] returns [Some v] when [numeric_replacements] holds an
   entry [(cp, v)], and [None] otherwise. The scan is linear and stops as soon
   as it meets a key greater than [cp]. *)
let find_replacement cp =
  let count = Array.length numeric_replacements in
  let rec search i =
    if i >= count then None
    else
      let (key, value) = numeric_replacements.(i) in
      if key = cp then Some value
      else if key > cp then None
      else search (i + 1)
  in
  search 0

(* [codepoint_to_utf8 cp] returns the UTF-8 encoding of the code point [cp].
   [cp] must be a Unicode scalar value: [Uchar.of_int] raises
   [Invalid_argument] otherwise. *)
let codepoint_to_utf8 cp =
  let buf = Buffer.create 4 in
  Buffer.add_utf_8_uchar buf (Uchar.of_int cp);
  Buffer.contents buf

let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)

(* Largest Unicode code point. *)
let max_codepoint = 0x10FFFF

(* [digit_value ~is_hex c] is the numeric value of the ASCII digit [c];
   the letters a-f / A-F are only accepted when [is_hex] is true. *)
let digit_value ~is_hex c =
  match c with
  | '0' .. '9' -> Some (Char.code c - Char.code '0')
  | 'a' .. 'f' when is_hex -> Some (Char.code c - Char.code 'a' + 10)
  | 'A' .. 'F' when is_hex -> Some (Char.code c - Char.code 'A' + 10)
  | _ -> None

(* [parse_codepoint text ~is_hex] reads [text] as an unsigned decimal (or
   hexadecimal when [is_hex]) number made of ASCII digits only.

   Returns [None] when [text] is empty or contains any other character
   (sign, underscore, radix prefix, ...). The accumulator stops growing once
   it exceeds [max_codepoint], so an arbitrarily long digit string can
   neither overflow nor wrap around to a negative integer. *)
let parse_codepoint text ~is_hex =
  let radix = if is_hex then 16 else 10 in
  let len = String.length text in
  let rec go i acc =
    if i >= len then Some acc
    else
      match digit_value ~is_hex text.[i] with
      | None -> None
      | Some d ->
        (* Saturate: anything above [max_codepoint] is already out of range,
           and the remaining characters only need to be validated. *)
        let acc = if acc > max_codepoint then acc else (acc * radix) + d in
        go (i + 1) acc
  in
  if len = 0 then None else go 0 0

(* [decode text ~is_hex] decodes the digits of a numeric character reference
   (the part between "&#" / "&#x" and ";") into a UTF-8 string.

   - Returns [None] when [text] is empty or is not made only of digits valid
     for the chosen radix.
   - Otherwise applies the [numeric_replacements] table, then returns
     [replacement_char] for surrogates and for values above U+10FFFF
     (including values too large for a native int), and the UTF-8 encoding
     of the code point in every other case.

   Zero needs no dedicated check: the table already maps it to U+FFFD. *)
let decode text ~is_hex =
  match parse_codepoint text ~is_hex with
  | None -> None
  | Some parsed ->
    (* Apply HTML5 replacements *)
    let cp =
      match find_replacement parsed with
      | Some replacement -> replacement
      | None -> parsed
    in
    (* [Uchar.is_valid] rejects negatives, surrogates (U+D800..U+DFFF) and
       anything above U+10FFFF, so [codepoint_to_utf8] cannot raise here. *)
    if Uchar.is_valid cp then Some (codepoint_to_utf8 cp)
    else Some replacement_char