Detect which human language a document uses from OCaml, from the Nu Html validator
languages
unicode
ocaml
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
  SPDX-License-Identifier: MIT
 ---------------------------------------------------------------------------*)
5
(** [detect_language input_text] runs a default-configured [Langdetect]
    detector over [input_text] and prints each returned result on its own
    line of standard output as ["<lang> <prob>"], with the probability
    formatted to four decimal places. Results are printed in the order
    [Langdetect.detect] returns them. *)
let detect_language input_text =
  let print_result (res : Langdetect.result) =
    Printf.printf "%s %.4f\n" res.lang res.prob
  in
  let ranked = Langdetect.detect (Langdetect.create_default ()) input_text in
  List.iter print_result ranked
12
(** [read_all_stdin ()] reads standard input until end of file and returns
    everything read as a single string. *)
let read_all_stdin () =
  let acc = Buffer.create 4096 in
  (* Pull 4096-byte chunks until the channel is exhausted. Per the Stdlib
     documentation, [Buffer.add_channel] still appends the bytes it managed
     to read before raising [End_of_file], so the final short chunk is kept. *)
  let rec fill () =
    match Buffer.add_channel acc stdin 4096 with
    | () -> fill ()
    | exception End_of_file -> ()
  in
  fill ();
  Buffer.contents acc
21
(** [read_file path] returns the entire contents of the file at [path].

    The channel is opened in binary mode so that the length reported by
    [in_channel_length] agrees with the number of bytes actually read (text
    mode may translate line endings on some platforms). The channel is
    closed whether or not reading succeeds.

    @raise Sys_error if the file cannot be opened or read.
    @raise End_of_file if fewer bytes than the reported length are
    available. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))
28
(** [run file_opt] is the command's entry point. It reads the input text
    from [path] when [file_opt] is [Some path], or from standard input when
    it is [None], then prints the detected languages via [detect_language].

    Returns [`Ok ()] on success. Returns [`Error (false, msg)] when the
    input is empty or whitespace-only, or when reading fails with
    [Sys_error] (for example an unreadable file), so that I/O failures are
    reported through the same error path instead of escaping as an uncaught
    exception. The result is intended for Cmdliner's [Term.ret]; per its
    documentation the [false] flag means the usage text is not printed. *)
let run file_opt =
  let read_input () =
    match file_opt with
    | Some path -> read_file path
    | None -> read_all_stdin ()
  in
  match read_input () with
  | exception Sys_error msg -> `Error (false, msg)
  | text ->
    if String.trim text = "" then `Error (false, "No input text provided")
    else begin
      detect_language text;
      `Ok ()
    end
41
42open Cmdliner
43
(** Optional first positional argument [FILE]. It evaluates to [None] when
    the argument is absent, in which case the caller falls back to standard
    input. Parsed with Cmdliner's [Arg.file] converter. *)
let file_arg =
  let doc = "Input file to detect language from. If not provided, reads from stdin." in
  let arg_info = Arg.info [] ~docv:"FILE" ~doc in
  Arg.value (Arg.pos 0 (Arg.some Arg.file) None arg_info)
47
(** The [langdetect] command: its manual page plus the term that applies
    [run] to the optional [FILE] argument and interprets the result with
    [Term.ret]. *)
let cmd =
  let summary = "Detect the language of text" in
  let description =
    [
      `S Manpage.s_description;
      `P "Detects the natural language of input text using n-gram frequency analysis.";
      `P "Outputs detected language codes and their probabilities as space-separated values, one per line, sorted by probability (highest first).";
    ]
  in
  let examples =
    [
      `S Manpage.s_examples;
      `P "Detect language from a file:";
      `Pre " langdetect document.txt";
      `P "Detect language from stdin:";
      `Pre " echo 'Hello world' | langdetect";
    ]
  in
  let man = description @ examples in
  let term = Term.ret (Term.app (Term.const run) file_arg) in
  Cmd.v (Cmd.info "langdetect" ~version:"%%VERSION%%" ~doc:summary ~man) term
64
(* Program entry point: evaluate the command line and exit with the status
   code that [Cmd.eval] returns. *)
let () = cmd |> Cmd.eval |> exit