Detect which human language a document uses from OCaml, from the Nu Html validator
languages unicode ocaml
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

at main 65 lines 2.0 kB view raw
(*---------------------------------------------------------------------------
  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
  SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** [detect_language input_text] builds the default [Langdetect] detector,
    runs it over [input_text], and prints one ["<lang> <prob>"] line per
    result to standard output (probability formatted with four decimals),
    in the order [Langdetect.detect] returns the results. *)
let detect_language input_text =
  let detector = Langdetect.create_default () in
  Langdetect.detect detector input_text
  |> List.iter (fun (r : Langdetect.result) ->
         Printf.printf "%s %.4f\n" r.lang r.prob)

(* Size in bytes of the scratch buffer used when draining a channel. *)
let chunk_size = 4096

(* [read_all ic] returns everything that can still be read from [ic].

   It loops on [input] until it reports 0 (end of file) rather than trusting
   [in_channel_length], so it also works on pipes and other inputs whose
   length is not known up front, and it does not use [End_of_file] for
   control flow. *)
let read_all ic =
  let acc = Buffer.create chunk_size in
  let chunk = Bytes.create chunk_size in
  let rec drain () =
    match input ic chunk 0 chunk_size with
    | 0 -> Buffer.contents acc
    | n ->
        Buffer.add_subbytes acc chunk 0 n;
        drain ()
  in
  drain ()

(** [read_all_stdin ()] returns the whole of standard input as a string. *)
let read_all_stdin () = read_all stdin

(** [read_file path] returns the full contents of the file at [path].

    The channel is closed on every exit path, including when reading fails.

    @raise Sys_error if the file cannot be opened or read. *)
let read_file path =
  let ic = open_in path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> read_all ic)

(** [run file_opt] reads the text from [file_opt] (or from standard input
    when it is [None]) and prints the detected languages.

    Returns [`Ok ()] on success, and [`Error (false, msg)] when the input is
    empty or whitespace-only, or when reading it raises [Sys_error]. *)
let run file_opt =
  let read () =
    match file_opt with
    | Some path -> read_file path
    | None -> read_all_stdin ()
  in
  match read () with
  (* Report I/O failures through the same error channel as empty input
     instead of letting the exception escape. *)
  | exception Sys_error msg -> `Error (false, msg)
  | text ->
      if String.trim text = "" then `Error (false, "No input text provided")
      else begin
        detect_language text;
        `Ok ()
      end

open Cmdliner

(** Optional positional argument 0: the file to read instead of stdin. *)
let file_arg =
  let doc =
    "Input file to detect language from. If not provided, reads from stdin."
  in
  Arg.(value & pos 0 (some file) None & info [] ~docv:"FILE" ~doc)

(** The [langdetect] command: its manual page and the term that calls
    {!run} with the parsed [FILE] argument. *)
let cmd =
  let doc = "Detect the language of text" in
  let man =
    [
      `S Manpage.s_description;
      `P "Detects the natural language of input text using n-gram frequency analysis.";
      `P "Outputs detected language codes and their probabilities as space-separated values, one per line, sorted by probability (highest first).";
      `S Manpage.s_examples;
      `P "Detect language from a file:";
      `Pre "  langdetect document.txt";
      `P "Detect language from stdin:";
      `Pre "  echo 'Hello world' | langdetect";
    ]
  in
  let info = Cmd.info "langdetect" ~version:"%%VERSION%%" ~doc ~man in
  Cmd.v info Term.(ret (const run $ file_arg))

(* Entry point: evaluate the command line and exit with its status code. *)
let () = exit (Cmd.eval cmd)