···11-(* Profile generator - converts JSON language profiles to OCaml module *)
22-33-let read_file path =
44- let ic = open_in path in
55- let n = in_channel_length ic in
66- let s = really_input_string ic n in
77- close_in ic;
88- s
99-1010-(* Simple JSON parser for profile format {"freq": {...}} *)
1111-let parse_freq_json content =
1212- (* Find the freq object *)
1313- let freq_start =
1414- try String.index content '{' + 1
1515- with Not_found -> failwith "No opening brace"
1616- in
1717- let content = String.sub content freq_start (String.length content - freq_start) in
1818- (* Skip to inner object *)
1919- let inner_start =
2020- try String.index content '{' + 1
2121- with Not_found -> failwith "No freq object"
2222- in
2323- let inner_end =
2424- try String.rindex content '}'
2525- with Not_found -> failwith "No closing brace"
2626- in
2727- let inner = String.sub content inner_start (inner_end - inner_start) in
2828-2929- (* Parse key:value pairs *)
3030- let pairs = ref [] in
3131- let i = ref 0 in
3232- let len = String.length inner in
3333- while !i < len do
3434- (* Skip whitespace *)
3535- while !i < len && (inner.[!i] = ' ' || inner.[!i] = '\n' || inner.[!i] = '\r' || inner.[!i] = '\t' || inner.[!i] = ',') do
3636- incr i
3737- done;
3838- if !i >= len then ()
3939- else begin
4040- (* Expect quote for key *)
4141- if inner.[!i] <> '"' then incr i
4242- else begin
4343- incr i;
4444- let key_start = !i in
4545- (* Find end of key *)
4646- while !i < len && inner.[!i] <> '"' do
4747- if inner.[!i] = '\\' then i := !i + 2
4848- else incr i
4949- done;
5050- let key = String.sub inner key_start (!i - key_start) in
5151- incr i; (* skip closing quote *)
5252- (* Skip colon *)
5353- while !i < len && (inner.[!i] = ':' || inner.[!i] = ' ') do incr i done;
5454- (* Parse number *)
5555- let num_start = !i in
5656- while !i < len && inner.[!i] >= '0' && inner.[!i] <= '9' do incr i done;
5757- let num_str = String.sub inner num_start (!i - num_start) in
5858- if num_str <> "" then begin
5959- let num = int_of_string num_str in
6060- pairs := (key, num) :: !pairs
6161- end
6262- end
6363- end
6464- done;
6565- !pairs
6666-6767-(* Escape string for OCaml, preserving UTF-8 characters *)
6868-let escape_ocaml_string s =
6969- let buf = Buffer.create (String.length s * 2) in
7070- String.iter (fun c ->
7171- match c with
7272- | '"' -> Buffer.add_string buf "\\\""
7373- | '\\' -> Buffer.add_string buf "\\\\"
7474- | '\n' -> Buffer.add_string buf "\\n"
7575- | '\r' -> Buffer.add_string buf "\\r"
7676- | '\t' -> Buffer.add_string buf "\\t"
7777- | c when Char.code c < 32 ->
7878- Buffer.add_string buf (Printf.sprintf "\\x%02x" (Char.code c))
7979- (* Keep all other characters including UTF-8 bytes as-is *)
8080- | c -> Buffer.add_char buf c
8181- ) s;
8282- Buffer.contents buf
8383-8484-let generate_profile_module lang_code pairs =
8585- let buf = Buffer.create 65536 in
8686- Buffer.add_string buf "(* Auto-generated language profile - do not edit *)\n\n";
8787- Buffer.add_string buf (Printf.sprintf "let lang = %S\n\n" lang_code);
8888- Buffer.add_string buf "let freq = [\n";
8989- List.iter (fun (ngram, count) ->
9090- (* Use custom escaping to preserve UTF-8 *)
9191- Buffer.add_string buf (Printf.sprintf " (\"%s\", %d);\n" (escape_ocaml_string ngram) count)
9292- ) (List.rev pairs);
9393- Buffer.add_string buf "]\n";
9494- Buffer.contents buf
9595-9696-let () =
9797- if Array.length Sys.argv < 3 then begin
9898- Printf.eprintf "Usage: %s <profiles_dir> <output_dir>\n" Sys.argv.(0);
9999- exit 1
100100- end;
101101-102102- let profiles_dir = Sys.argv.(1) in
103103- let output_dir = Sys.argv.(2) in
104104-105105- (* Process each profile *)
106106- let entries = Sys.readdir profiles_dir in
107107- let lang_codes = ref [] in
108108-109109- Array.iter (fun entry ->
110110- let path = Filename.concat profiles_dir entry in
111111- if Sys.is_directory path then ()
112112- else begin
113113- try
114114- let content = read_file path in
115115- let pairs = parse_freq_json content in
116116- let lang_code =
117117- (* Normalize lang code: zh-cn -> zh_cn *)
118118- String.map (fun c -> if c = '-' then '_' else c) entry
119119- in
120120- let ml_content = generate_profile_module entry pairs in
121121- let out_path = Filename.concat output_dir (Printf.sprintf "profile_%s.ml" lang_code) in
122122- let oc = open_out out_path in
123123- output_string oc ml_content;
124124- close_out oc;
125125- lang_codes := (entry, lang_code) :: !lang_codes
126126- with e ->
127127- Printf.eprintf "Error processing %s: %s\n%!" entry (Printexc.to_string e);
128128- exit 1
129129- end
130130- ) entries;
131131-132132- (* Sort language codes for deterministic output *)
133133- let sorted_codes = List.sort (fun (a, _) (b, _) -> String.compare a b) !lang_codes in
134134-135135- (* Generate profiles index module *)
136136- let index_path = Filename.concat output_dir "profiles.ml" in
137137- let oc = open_out index_path in
138138- Printf.fprintf oc "(* Auto-generated profiles index - do not edit *)\n\n";
139139- Printf.fprintf oc "let all_profiles = [\n";
140140- List.iter (fun (orig_code, ml_code) ->
141141- Printf.fprintf oc " (%S, Profile_%s.freq);\n" orig_code ml_code
142142- ) sorted_codes;
143143- Printf.fprintf oc "]\n";
144144- close_out oc
-277
lib/langdetect/langdetect.ml
···11-(** Language detection library based on n-gram frequency analysis.
22-33- This is an OCaml port of the Cybozu langdetect algorithm. *)
44-55-module StringMap = Map.Make(String)
66-77-(** Language detection result *)
88-type result = {
99- lang: string;
1010- prob: float;
1111-}
1212-1313-(** Detection parameters *)
1414-type config = {
1515- alpha: float; (** Smoothing parameter (default 0.5) *)
1616- n_trial: int; (** Number of random trials (default 7) *)
1717- max_text_length: int; (** Maximum text length to process *)
1818- conv_threshold: float; (** Convergence threshold *)
1919- prob_threshold: float; (** Minimum probability to report *)
2020-}
2121-2222-let default_config = {
2323- alpha = 0.5;
2424- n_trial = 7;
2525- max_text_length = 10000;
2626- conv_threshold = 0.99999;
2727- prob_threshold = 0.1;
2828-}
2929-3030-(** N-gram extraction parameters *)
3131-let n_gram_max = 3
3232-let base_freq = 10000
3333-let iteration_limit = 1000
3434-let alpha_width = 0.05
3535-3636-(** Detector state *)
3737-type t = {
3838- config: config;
3939- (* Map from n-gram -> array of probabilities per language *)
4040- word_lang_prob: float array StringMap.t;
4141- (* List of language codes *)
4242- lang_list: string array;
4343- (* Random seed for reproducibility *)
4444- mutable seed: int option;
4545-}
4646-4747-(** Normalize a Unicode code point for n-gram extraction *)
4848-let normalize_uchar uchar =
4949- let code = Uchar.to_int uchar in
5050- (* Basic Latin: keep only letters *)
5151- if code < 128 then begin
5252- let c = Char.chr code in
5353- if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then
5454- Some (String.make 1 c)
5555- else
5656- None (* Treat as space/separator *)
5757- end
5858- else begin
5959- (* Keep non-ASCII characters as-is *)
6060- let buf = Buffer.create 4 in
6161- Buffer.add_utf_8_uchar buf uchar;
6262- Some (Buffer.contents buf)
6363- end
6464-6565-(** Extract n-grams from UTF-8 text.
6666- N-grams are sequences of 1-3 Unicode characters. *)
6767-let extract_ngrams ?(max_len=10000) text word_lang_prob =
6868- let ngrams = ref [] in
6969- (* Buffer stores up to 3 most recent character strings *)
7070- let char_buffer = Array.make n_gram_max "" in
7171- let char_count = ref 0 in
7272- let processed = ref 0 in
7373-7474- (* Process each UTF-8 character *)
7575- let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
7676- let rec process () =
7777- if !processed >= max_len then ()
7878- else match Uutf.decode decoder with
7979- | `Await -> () (* String source never awaits *)
8080- | `End -> ()
8181- | `Malformed _ -> process () (* Skip malformed sequences *)
8282- | `Uchar uchar ->
8383- incr processed;
8484- match normalize_uchar uchar with
8585- | None ->
8686- (* Separator - reset buffer *)
8787- char_buffer.(0) <- "";
8888- char_buffer.(1) <- "";
8989- char_buffer.(2) <- "";
9090- char_count := 0;
9191- process ()
9292- | Some char_str ->
9393- (* Shift buffer left and add new char *)
9494- char_buffer.(0) <- char_buffer.(1);
9595- char_buffer.(1) <- char_buffer.(2);
9696- char_buffer.(2) <- char_str;
9797- incr char_count;
9898-9999- (* Extract 1, 2, 3 grams based on how many chars we have *)
100100- let available = min !char_count n_gram_max in
101101- for n = 1 to available do
102102- let ngram =
103103- let start_idx = n_gram_max - n in
104104- let parts = ref [] in
105105- for i = start_idx to n_gram_max - 1 do
106106- parts := char_buffer.(i) :: !parts
107107- done;
108108- String.concat "" (List.rev !parts)
109109- in
110110- if StringMap.mem ngram word_lang_prob then
111111- ngrams := ngram :: !ngrams
112112- done;
113113- process ()
114114- in
115115- process ();
116116- Array.of_list (List.rev !ngrams)
117117-118118-(** Initialize uniform probability distribution *)
119119-let init_prob n_langs =
120120- let prob = Array.make n_langs (1.0 /. float_of_int n_langs) in
121121- prob
122122-123123-(** Update language probabilities with an n-gram *)
124124-let update_lang_prob prob ngram word_lang_prob alpha =
125125- match StringMap.find_opt ngram word_lang_prob with
126126- | None -> false
127127- | Some lang_prob_map ->
128128- let weight = alpha /. float_of_int base_freq in
129129- for i = 0 to Array.length prob - 1 do
130130- prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i))
131131- done;
132132- true
133133-134134-(** Normalize probabilities and return max *)
135135-let normalize_prob prob =
136136- let sum = Array.fold_left (+.) 0.0 prob in
137137- if sum <= 0.0 then 0.0
138138- else begin
139139- let max_p = ref 0.0 in
140140- for i = 0 to Array.length prob - 1 do
141141- prob.(i) <- prob.(i) /. sum;
142142- if prob.(i) > !max_p then max_p := prob.(i)
143143- done;
144144- !max_p
145145- end
146146-147147-(** Simple pseudo-random number generator *)
148148-let random_state = ref 12345
149149-150150-let set_seed seed =
151151- random_state := seed
152152-153153-let next_random () =
154154- random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF;
155155- !random_state
156156-157157-let random_int bound =
158158- (next_random ()) mod bound
159159-160160-let random_gaussian () =
161161- (* Box-Muller transform approximation *)
162162- let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
163163- let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
164164- let u1 = max 0.0001 u1 in (* Avoid log(0) *)
165165- sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
166166-167167-(** Run detection on extracted n-grams *)
168168-let detect_block t ngrams =
169169- let n_langs = Array.length t.lang_list in
170170- if n_langs = 0 || Array.length ngrams = 0 then [||]
171171- else begin
172172- let lang_prob = Array.make n_langs 0.0 in
173173-174174- (* Set seed if specified *)
175175- (match t.seed with
176176- | Some s -> set_seed s
177177- | None -> set_seed (int_of_float (Unix.gettimeofday () *. 1000.0)));
178178-179179- for _ = 0 to t.config.n_trial - 1 do
180180- let prob = init_prob n_langs in
181181- let alpha = t.config.alpha +. random_gaussian () *. alpha_width in
182182-183183- let converged = ref false in
184184- let i = ref 0 in
185185- while not !converged && !i < iteration_limit do
186186- let r = random_int (Array.length ngrams) in
187187- let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
188188- if !i mod 5 = 0 then begin
189189- let max_p = normalize_prob prob in
190190- if max_p > t.config.conv_threshold then converged := true
191191- end;
192192- incr i
193193- done;
194194-195195- (* Accumulate probabilities *)
196196- for j = 0 to n_langs - 1 do
197197- lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial
198198- done
199199- done;
200200-201201- lang_prob
202202- end
203203-204204-(** Create detector from profiles *)
205205-let create ?(config=default_config) profiles =
206206- let lang_list = Array.of_list (List.map fst profiles) in
207207- let n_langs = Array.length lang_list in
208208-209209- (* Build word -> lang prob map *)
210210- (* First, collect all unique n-grams and their frequencies per language *)
211211- let all_ngrams = Hashtbl.create 65536 in
212212- let lang_totals = Array.make n_langs 0 in
213213-214214- List.iteri (fun lang_idx (_, freq_list) ->
215215- List.iter (fun (ngram, count) ->
216216- let current =
217217- match Hashtbl.find_opt all_ngrams ngram with
218218- | Some arr -> arr
219219- | None ->
220220- let arr = Array.make n_langs 0 in
221221- Hashtbl.add all_ngrams ngram arr;
222222- arr
223223- in
224224- current.(lang_idx) <- count;
225225- lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
226226- ) freq_list
227227- ) profiles;
228228-229229- (* Convert to probability map *)
230230- let word_lang_prob =
231231- Hashtbl.fold (fun ngram counts acc ->
232232- (* Compute probability for each language *)
233233- let probs = Array.make n_langs 0.0 in
234234- for i = 0 to n_langs - 1 do
235235- if lang_totals.(i) > 0 then
236236- probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
237237- done;
238238- StringMap.add ngram probs acc
239239- ) all_ngrams StringMap.empty
240240- in
241241-242242- { config; word_lang_prob; lang_list; seed = None }
243243-244244-(** Set random seed for reproducibility *)
245245-let set_random_seed t seed =
246246- t.seed <- Some seed
247247-248248-(** Detect language of text *)
249249-let detect t text =
250250- let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in
251251- if Array.length ngrams = 0 then []
252252- else begin
253253- let probs = detect_block t ngrams in
254254- (* Sort by probability descending *)
255255- let results = ref [] in
256256- for i = 0 to Array.length probs - 1 do
257257- if probs.(i) > t.config.prob_threshold then
258258- results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
259259- done;
260260- List.sort (fun a b -> compare b.prob a.prob) !results
261261- end
262262-263263-(** Get best language or None *)
264264-let detect_best t text =
265265- match detect t text with
266266- | [] -> None
267267- | best :: _ -> Some best.lang
268268-269269-(** Get best language with probability *)
270270-let detect_with_prob t text =
271271- match detect t text with
272272- | [] -> None
273273- | best :: _ -> Some (best.lang, best.prob)
274274-275275-(** Create a detector with all built-in profiles *)
276276-let create_default ?config () =
277277- create ?config Profiles.all_profiles
-47
lib/langdetect/langdetect.mli
···11-(** Language detection library based on n-gram frequency analysis. *)
22-33-(** Language detection result *)
44-type result = {
55- lang: string;
66- prob: float;
77-}
88-99-(** Detection parameters *)
1010-type config = {
1111- alpha: float; (** Smoothing parameter (default 0.5) *)
1212- n_trial: int; (** Number of random trials (default 7) *)
1313- max_text_length: int; (** Maximum text length to process *)
1414- conv_threshold: float; (** Convergence threshold *)
1515- prob_threshold: float; (** Minimum probability to report *)
1616-}
1717-1818-(** Default configuration *)
1919-val default_config : config
2020-2121-(** Detector state *)
2222-type t
2323-2424-(** Create detector from language profiles.
2525- Each profile is (lang_code, frequency_list) where frequency_list is
2626- a list of (ngram, count) pairs. *)
2727-val create : ?config:config -> (string * (string * int) list) list -> t
2828-2929-(** Set random seed for reproducible results *)
3030-val set_random_seed : t -> int -> unit
3131-3232-(** Detect language of text.
3333- Returns list of possible languages with probabilities, sorted by
3434- probability descending. Only languages above prob_threshold are included. *)
3535-val detect : t -> string -> result list
3636-3737-(** Detect best matching language.
3838- Returns None if no language could be detected. *)
3939-val detect_best : t -> string -> string option
4040-4141-(** Detect best matching language with its probability.
4242- Returns None if no language could be detected. *)
4343-val detect_with_prob : t -> string -> (string * float) option
4444-4545-(** Create a detector with all built-in language profiles.
4646- This is a convenience function that calls create with all supported profiles. *)
4747-val create_default : ?config:config -> unit -> t