···1616 let user = River.User.make ~username ~fullname ?email () in
1717 match River.State.create_user state user with
1818 | Ok () ->
1919- Log.info (fun m -> m "User %s created" username);
1919+ Fmt.pr "@.%a %a %a@.@."
2020+ Fmt.(styled (`Fg `Green) string) "✓"
2121+ Fmt.(styled `Bold string) "User created:"
2222+ Fmt.(styled (`Fg `Cyan) string) username;
2023 0
2124 | Error err ->
2222- Log.err (fun m -> m "%s" err);
2525+ Fmt.pr "@.%a %s@.@."
2626+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
2727+ err;
2328 1
24292530 let remove state ~username =
2631 match River.State.delete_user state ~username with
2732 | Ok () ->
2828- Log.info (fun m -> m "User %s removed" username);
3333+ Fmt.pr "@.%a %a %a@.@."
3434+ Fmt.(styled (`Fg `Green) string) "✓"
3535+ Fmt.(styled `Bold string) "User removed:"
3636+ Fmt.(styled (`Fg `Cyan) string) username;
2937 0
3038 | Error err ->
3131- Log.err (fun m -> m "%s" err);
3939+ Fmt.pr "@.%a %s@.@."
4040+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
4141+ err;
3242 1
33433444 let list state =
3545 let users = River.State.list_users state in
3636- if users = [] then
3737- Printf.printf "No users found\n"
3838- else begin
3939- Printf.printf "Users:\n";
4646+ if users = [] then begin
4747+ Fmt.pr "@.%a@.@."
4848+ Fmt.(styled `Yellow string)
4949+ "No users found. Use 'river-cli user add' to create one."
5050+ end else begin
5151+ Fmt.pr "@.%a@."
5252+ Fmt.(styled `Bold (styled (`Fg `Cyan) string))
5353+ (Printf.sprintf "Users (%d total)" (List.length users));
5454+ Fmt.pr "%a@.@." Fmt.(styled `Faint string) (String.make 60 '-');
4055 List.iter (fun username ->
4156 match River.State.get_user state ~username with
4257 | Some user ->
4358 let email_str = match River.User.email user with
4444- | Some e -> " <" ^ e ^ ">"
5959+ | Some e -> Fmt.str " %a" Fmt.(styled `Faint string) ("<" ^ e ^ ">")
4560 | None -> ""
4661 in
4747- Printf.printf " %s (%s%s) - %d feeds\n"
4848- username (River.User.fullname user) email_str
4949- (List.length (River.User.feeds user))
6262+ let feed_count = List.length (River.User.feeds user) in
6363+ Fmt.pr "%a %a%s@."
6464+ Fmt.(styled `Bold (styled (`Fg `Blue) string)) username
6565+ Fmt.(styled `Green string) (River.User.fullname user)
6666+ email_str;
6767+ Fmt.pr " %a %a %a@.@."
6868+ Fmt.(styled `Faint string) "└─"
6969+ Fmt.(styled (`Fg `Yellow) string) (string_of_int feed_count)
7070+ Fmt.(styled `Faint string) (if feed_count = 1 then "feed" else "feeds")
5071 | None -> ()
5172 ) users
5273 end;
···5576 let add_feed state ~username ~name ~url =
5677 match River.State.get_user state ~username with
5778 | None ->
5858- Log.err (fun m -> m "User %s not found" username);
7979+ Fmt.pr "@.%a User %a not found@.@."
8080+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
8181+ Fmt.(styled `Bold string) username;
5982 1
6083 | Some user ->
6184 let source = River.Source.make ~name ~url in
6285 let user = River.User.add_feed user source in
6386 (match River.State.update_user state user with
6487 | Ok () ->
6565- Log.info (fun m -> m "Feed %s added to user %s" name username);
8888+ Fmt.pr "@.%a Feed added to %a@."
8989+ Fmt.(styled (`Fg `Green) string) "✓"
9090+ Fmt.(styled (`Fg `Cyan) string) username;
9191+ Fmt.pr " %a %a@."
9292+ Fmt.(styled `Faint string) "Name:"
9393+ Fmt.(styled `Bold string) name;
9494+ Fmt.pr " %a %a@.@."
9595+ Fmt.(styled `Faint string) "URL: "
9696+ Fmt.(styled (`Fg `Blue) string) url;
6697 0
6798 | Error err ->
6868- Log.err (fun m -> m "%s" err);
9999+ Fmt.pr "@.%a %s@.@."
100100+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
101101+ err;
69102 1)
7010371104 let remove_feed state ~username ~url =
72105 match River.State.get_user state ~username with
73106 | None ->
7474- Log.err (fun m -> m "User %s not found" username);
107107+ Fmt.pr "@.%a User %a not found@.@."
108108+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
109109+ Fmt.(styled `Bold string) username;
75110 1
76111 | Some user ->
77112 let user = River.User.remove_feed user ~url in
78113 (match River.State.update_user state user with
79114 | Ok () ->
8080- Log.info (fun m -> m "Feed removed from user %s" username);
115115+ Fmt.pr "@.%a Feed removed from %a@.@."
116116+ Fmt.(styled (`Fg `Green) string) "✓"
117117+ Fmt.(styled (`Fg `Cyan) string) username;
81118 0
82119 | Error err ->
8383- Log.err (fun m -> m "%s" err);
120120+ Fmt.pr "@.%a %s@.@."
121121+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
122122+ err;
84123 1)
8512486125 let show state ~username =
87126 match River.State.get_user state ~username with
88127 | None ->
8989- Log.err (fun m -> m "User %s not found" username);
128128+ Fmt.pr "@.%a User %a not found@.@."
129129+ Fmt.(styled (`Fg `Red) string) "✗ Error:"
130130+ Fmt.(styled `Bold string) username;
90131 1
91132 | Some user ->
9292- Printf.printf "Username: %s\n" (River.User.username user);
9393- Printf.printf "Full name: %s\n" (River.User.fullname user);
9494- Printf.printf "Email: %s\n"
9595- (Option.value (River.User.email user) ~default:"(none)");
9696- Printf.printf "Last synced: %s\n"
133133+ Fmt.pr "@.%a@."
134134+ Fmt.(styled `Bold (styled (`Fg `Cyan) string))
135135+ (Printf.sprintf "User: %s" (River.User.username user));
136136+ Fmt.pr "%a@.@." Fmt.(styled `Faint string) (String.make 60 '-');
137137+138138+ Fmt.pr "%a %a@."
139139+ Fmt.(styled `Faint string) "Full name:"
140140+ Fmt.(styled `Green string) (River.User.fullname user);
141141+142142+ Fmt.pr "%a %a@."
143143+ Fmt.(styled `Faint string) "Email: "
144144+ Fmt.string (Option.value (River.User.email user) ~default:"(not set)");
145145+146146+ Fmt.pr "%a %a@.@."
147147+ Fmt.(styled `Faint string) "Synced: "
148148+ Fmt.(styled `Yellow string)
97149 (Option.value (River.User.last_synced user) ~default:"never");
150150+98151 let feeds = River.User.feeds user in
9999- Printf.printf "Feeds (%d):\n" (List.length feeds);
100100- List.iter (fun feed ->
101101- Printf.printf " - %s: %s\n"
102102- (River.Source.name feed) (River.Source.url feed)
103103- ) feeds;
152152+ Fmt.pr "%a@."
153153+ Fmt.(styled `Bold string)
154154+ (Printf.sprintf "Feeds (%d)" (List.length feeds));
155155+ Fmt.pr "%a@." Fmt.(styled `Faint string) (String.make 60 '-');
156156+157157+ if feeds = [] then
158158+ Fmt.pr "%a@.@."
159159+ Fmt.(styled `Faint string)
160160+ " No feeds configured. Use 'river-cli user add-feed' to add one."
161161+ else
162162+ List.iter (fun feed ->
163163+ Fmt.pr "@.%a@."
164164+ Fmt.(styled `Bold (styled (`Fg `Blue) string))
165165+ (River.Source.name feed);
166166+ Fmt.pr " %a %a@.@."
167167+ Fmt.(styled `Faint string) "URL:"
168168+ Fmt.(styled (`Fg `Magenta) string) (River.Source.url feed)
169169+ ) feeds;
104170 0
105171end
106172107173(* Sync command *)
108174module Sync = struct
109175 let sync_user env state ~username =
176176+ Fmt.pr "@.%a Syncing feeds for %a...@."
177177+ Fmt.(styled (`Fg `Cyan) string) "→"
178178+ Fmt.(styled `Bold string) username;
110179 match River.State.sync_user env state ~username with
111180 | Ok () ->
112112- Log.info (fun m -> m "Sync completed for user %s" username);
181181+ Fmt.pr "%a Sync completed successfully@.@."
182182+ Fmt.(styled (`Fg `Green) string) "✓";
113183 0
114184 | Error err ->
115115- Log.err (fun m -> m "Sync failed: %s" err);
185185+ Fmt.pr "%a Sync failed: %s@.@."
186186+ Fmt.(styled (`Fg `Red) string) "✗"
187187+ err;
116188 1
117189118190 let sync_all env state =
191191+ Fmt.pr "@.%a Syncing all users...@.@."
192192+ Fmt.(styled (`Fg `Cyan) string) "→";
119193 match River.State.sync_all env state with
120194 | Ok (success, fail) ->
121121- Log.info (fun m -> m "Synced %d users (%d failed)" success fail);
122122- if fail = 0 then 0 else 1
195195+ if fail = 0 then begin
196196+ Fmt.pr "%a Successfully synced %a@.@."
197197+ Fmt.(styled (`Fg `Green) string) "✓"
198198+ Fmt.(styled `Bold (styled (`Fg `Green) string)) (Printf.sprintf "%d users" success);
199199+ 0
200200+ end else begin
201201+ Fmt.pr "%a Synced %a, %a@.@."
202202+ Fmt.(styled `Yellow string) "⚠"
203203+ Fmt.(styled (`Fg `Green) string) (Printf.sprintf "%d users" success)
204204+ Fmt.(styled (`Fg `Red) string) (Printf.sprintf "%d failed" fail);
205205+ 1
206206+ end
123207 | Error err ->
124124- Log.err (fun m -> m "Sync failed: %s" err);
208208+ Fmt.pr "%a Sync failed: %s@.@."
209209+ Fmt.(styled (`Fg `Red) string) "✗"
210210+ err;
125211 1
126212end
127213···633719 1
634720 | Ok metrics ->
635721 (* Display quality metrics *)
636636- Fmt.pr "@.";
637637- Fmt.pr "%a@." Fmt.(styled `Bold string)
638638- (String.make 70 '=');
639639- Fmt.pr " %a %s@." Fmt.(styled `Bold (styled (`Fg `Blue) string))
640640- "User Quality Analysis:" username;
641641- Fmt.pr "%a@.@." Fmt.(styled `Bold string)
642642- (String.make 70 '=');
722722+ Fmt.pr "@.%a@."
723723+ Fmt.(styled `Bold (styled (`Fg `Cyan) string))
724724+ (Printf.sprintf "Feed Quality Analysis: %s" username);
725725+ Fmt.pr "%a@.@." Fmt.(styled `Faint string) (String.make 70 '=');
643726644644- (* Overall quality score *)
727727+ (* Overall quality score with visual indicator *)
645728 let score = River.Quality.quality_score metrics in
646646- let score_color = match score with
647647- | s when s >= 80.0 -> `Green
648648- | s when s >= 60.0 -> `Yellow
649649- | s when s >= 40.0 -> `Magenta
650650- | _ -> `Red
729729+ let score_color, score_label = match score with
730730+ | s when s >= 80.0 -> `Green, "Excellent"
731731+ | s when s >= 60.0 -> `Yellow, "Good"
732732+ | s when s >= 40.0 -> `Magenta, "Fair"
733733+ | _ -> `Red, "Poor"
651734 in
652652- Fmt.pr "%a %.1f/100.0@.@."
653653- Fmt.(styled (`Fg score_color) (styled `Bold string))
654654- "Overall Quality Score:"
655655- score;
735735+ let bar_width = 40 in
736736+ let filled = int_of_float (score /. 100.0 *. float_of_int bar_width) in
737737+ let bar = String.make filled '#' ^ String.make (bar_width - filled) '-' in
738738+ Fmt.pr "%a@."
739739+ Fmt.(styled `Bold string) "Overall Quality Score";
740740+ Fmt.pr " %a %.1f/100 %a@.@."
741741+ Fmt.(styled (`Fg score_color) string) bar
742742+ score
743743+ Fmt.(styled (`Fg score_color) (styled `Bold string)) (Printf.sprintf "(%s)" score_label);
656744657745 (* Entry statistics *)
658658- Fmt.pr "%a@." Fmt.(styled `Cyan string) "Entry Statistics:";
659659- Fmt.pr " Total entries: %d@." (River.Quality.total_entries metrics);
746746+ Fmt.pr "%a %a@."
747747+ Fmt.(styled `Bold string) "📊 Entries:"
748748+ Fmt.(styled (`Fg `Yellow) (styled `Bold string))
749749+ (string_of_int (River.Quality.total_entries metrics));
660750 Fmt.pr "@.";
661751662662- (* Completeness metrics *)
663663- Fmt.pr "%a@." Fmt.(styled `Cyan string) "Completeness:";
752752+ (* Completeness metrics with visual indicators *)
753753+ Fmt.pr "%a@." Fmt.(styled `Bold string) "Completeness";
664754 let total = River.Quality.total_entries metrics in
665755 let pct entries =
666756 float_of_int entries /. float_of_int total *. 100.0
667757 in
668668- Fmt.pr " Entries with content: %3d/%d (%5.1f%%)@."
669669- (River.Quality.entries_with_content metrics)
670670- total
671671- (pct (River.Quality.entries_with_content metrics));
672672- Fmt.pr " Entries with dates: %3d/%d (%5.1f%%)@."
673673- (River.Quality.entries_with_date metrics)
674674- total
675675- (pct (River.Quality.entries_with_date metrics));
676676- Fmt.pr " Entries with authors: %3d/%d (%5.1f%%)@."
677677- (River.Quality.entries_with_author metrics)
678678- total
679679- (pct (River.Quality.entries_with_author metrics));
680680- Fmt.pr " Entries with summaries:%3d/%d (%5.1f%%)@."
681681- (River.Quality.entries_with_summary metrics)
682682- total
683683- (pct (River.Quality.entries_with_summary metrics));
684684- Fmt.pr " Entries with tags: %3d/%d (%5.1f%%)@."
685685- (River.Quality.entries_with_tags metrics)
686686- total
687687- (pct (River.Quality.entries_with_tags metrics));
758758+ let show_metric label count =
759759+ let p = pct count in
760760+ let icon, color = match p with
761761+ | p when p >= 90.0 -> "✓", `Green
762762+ | p when p >= 50.0 -> "○", `Yellow
763763+ | _ -> "✗", `Red
764764+ in
765765+ Fmt.pr " %a %s %3d/%d %a@."
766766+ Fmt.(styled (`Fg color) string) icon
767767+ label
768768+ count total
769769+ Fmt.(styled `Faint string) (Printf.sprintf "(%.1f%%)" p)
770770+ in
771771+ show_metric "Content: " (River.Quality.entries_with_content metrics);
772772+ show_metric "Dates: " (River.Quality.entries_with_date metrics);
773773+ show_metric "Authors: " (River.Quality.entries_with_author metrics);
774774+ show_metric "Summaries:" (River.Quality.entries_with_summary metrics);
775775+ show_metric "Tags: " (River.Quality.entries_with_tags metrics);
688776 Fmt.pr "@.";
689777690778 (* Content statistics *)
691779 if River.Quality.entries_with_content metrics > 0 then begin
692692- Fmt.pr "%a@." Fmt.(styled `Cyan string) "Content Statistics:";
693693- Fmt.pr " Average length: %.0f characters@."
780780+ Fmt.pr "%a@." Fmt.(styled `Bold string) "Content Statistics";
781781+ Fmt.pr " %a %.0f chars@."
782782+ Fmt.(styled `Faint string) "Average:"
694783 (River.Quality.avg_content_length metrics);
695695- Fmt.pr " Min length: %d characters@."
696696- (River.Quality.min_content_length metrics);
697697- Fmt.pr " Max length: %d characters@."
698698- (River.Quality.max_content_length metrics);
699699- Fmt.pr "@."
784784+ Fmt.pr " %a %a ... %a@.@."
785785+ Fmt.(styled `Faint string) "Range: "
786786+ Fmt.(styled (`Fg `Cyan) string) (string_of_int (River.Quality.min_content_length metrics))
787787+ Fmt.(styled (`Fg `Cyan) string) (string_of_int (River.Quality.max_content_length metrics))
700788 end;
701789702790 (* Posting frequency *)
703791 (match River.Quality.posting_frequency_days metrics with
704792 | Some freq ->
705705- Fmt.pr "%a@." Fmt.(styled `Cyan string) "Posting Frequency:";
706706- Fmt.pr " Average: %.1f days between posts@." freq;
793793+ Fmt.pr "%a@." Fmt.(styled `Bold string) "Posting Frequency";
707794 let posts_per_week = 7.0 /. freq in
708708- Fmt.pr " (~%.1f posts per week)@." posts_per_week;
709709- Fmt.pr "@."
795795+ Fmt.pr " %a %.1f days between posts@."
796796+ Fmt.(styled `Faint string) "Average:"
797797+ freq;
798798+ Fmt.pr " %a ~%.1f posts/week@.@."
799799+ Fmt.(styled `Faint string) " "
800800+ posts_per_week
710801 | None ->
711802 Fmt.pr "%a@.@." Fmt.(styled `Faint string)
712803 "Not enough data to calculate posting frequency");
+123
stack/river/lib/feed.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed fetching and parsing. *)
1919+2020+let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
2121+module Log = (val Logs.src_log src : Logs.LOG)
2222+2323+type feed_content =
2424+ | Atom of Syndic.Atom.feed
2525+ | Rss2 of Syndic.Rss2.channel
2626+ | Json of Jsonfeed.t
2727+2828+type t = {
2929+ source : Source.t;
3030+ title : string;
3131+ content : feed_content;
3232+}
3333+3434+let string_of_feed = function
3535+ | Atom _ -> "Atom"
3636+ | Rss2 _ -> "Rss2"
3737+ | Json _ -> "JSONFeed"
3838+3939+let classify_feed ~xmlbase (body : string) =
4040+ Log.debug (fun m -> m "Attempting to parse feed (%d bytes)" (String.length body));
4141+4242+ (* Quick check - does it look like JSON? *)
4343+ let looks_like_json =
4444+ String.length body > 0 &&
4545+ let first_char = String.get body 0 in
4646+ first_char = '{' || first_char = '['
4747+ in
4848+4949+ if looks_like_json then (
5050+ (* Try JSONFeed first *)
5151+ Log.debug (fun m -> m "Body looks like JSON, trying JSONFeed parser");
5252+ match Jsonfeed.of_string body with
5353+ | Ok jsonfeed ->
5454+ Log.debug (fun m -> m "Successfully parsed as JSONFeed");
5555+ Json jsonfeed
5656+ | Error err ->
5757+ Log.debug (fun m -> m "Not a JSONFeed: %s" (Jsont.Error.to_string err));
5858+ (* Fall through to XML parsing *)
5959+ failwith "Not a valid JSONFeed"
6060+ ) else (
6161+ (* Try XML formats *)
6262+ try
6363+ let feed = Atom (Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in
6464+ Log.debug (fun m -> m "Successfully parsed as Atom feed");
6565+ feed
6666+ with
6767+ | Syndic.Atom.Error.Error (pos, msg) -> (
6868+ Log.debug (fun m -> m "Not an Atom feed: %s at position (%d, %d)"
6969+ msg (fst pos) (snd pos));
7070+ try
7171+ let feed = Rss2 (Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in
7272+ Log.debug (fun m -> m "Successfully parsed as RSS2 feed");
7373+ feed
7474+ with Syndic.Rss2.Error.Error (pos, msg) ->
7575+ Log.err (fun m -> m "Failed to parse as RSS2: %s at position (%d, %d)"
7676+ msg (fst pos) (snd pos));
7777+ failwith "Neither Atom nor RSS2 feed")
7878+ | Not_found as e ->
7979+ Log.err (fun m -> m "Not_found exception during Atom feed parsing");
8080+ Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ()));
8181+ raise e
8282+ | e ->
8383+ Log.err (fun m -> m "Unexpected exception during feed parsing: %s"
8484+ (Printexc.to_string e));
8585+ Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ()));
8686+ raise e
8787+ )
8888+8989+let fetch session source =
9090+ Log.info (fun m -> m "Fetching feed: %s" (Source.name source));
9191+9292+ let xmlbase = Uri.of_string (Source.url source) in
9393+9494+ (* Use Requests_json_api.get_result for clean Result-based error handling *)
9595+ let requests_session = Session.get_requests_session session in
9696+ let response =
9797+ match Requests_json_api.get_result requests_session (Source.url source) with
9898+ | Ok body ->
9999+ Log.info (fun m -> m "Successfully fetched %s (%d bytes)"
100100+ (Source.url source) (String.length body));
101101+ body
102102+ | Error (status, msg) ->
103103+ Log.err (fun m -> m "Failed to fetch feed '%s': HTTP %d - %s"
104104+ (Source.name source) status msg);
105105+ failwith (Printf.sprintf "HTTP %d: %s" status msg)
106106+ in
107107+108108+ let content = classify_feed ~xmlbase response in
109109+ let title =
110110+ match content with
111111+ | Atom atom -> Text_extract.string_of_text_construct atom.Syndic.Atom.title
112112+ | Rss2 ch -> ch.Syndic.Rss2.title
113113+ | Json jsonfeed -> Jsonfeed.title jsonfeed
114114+ in
115115+116116+ Log.info (fun m -> m "Successfully fetched %s feed '%s' (title: '%s')"
117117+ (string_of_feed content) (Source.name source) title);
118118+119119+ { source; title; content }
120120+121121+let source t = t.source
122122+let content t = t.content
123123+let title t = t.title
+43
stack/river/lib/feed.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed fetching and parsing. *)
1919+2020+type feed_content =
2121+ | Atom of Syndic.Atom.feed
2222+ | Rss2 of Syndic.Rss2.channel
2323+ | Json of Jsonfeed.t
2424+(** The underlying feed content, which can be Atom, RSS2, or JSONFeed format. *)
2525+2626+type t
2727+(** An Atom, RSS2, or JSON Feed. *)
2828+2929+val fetch : Session.t -> Source.t -> t
3030+(** [fetch session source] fetches and parses a feed from the given source.
3131+3232+ @param session The HTTP session
3333+ @param source The feed source to fetch
3434+ @raise Failure if the feed cannot be fetched or parsed *)
3535+3636+val source : t -> Source.t
3737+(** [source feed] returns the source this feed was fetched from. *)
3838+3939+val content : t -> feed_content
4040+(** [content feed] returns the underlying feed content. *)
4141+4242+val title : t -> string
4343+(** [title feed] returns the feed title. *)
+139
stack/river/lib/format.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed format conversion and export. *)
1919+2020+module Atom = struct
2121+ let entry_of_post post =
2222+ let content = Syndic.Atom.Html (None, Post.content post) in
2323+ let contributors =
2424+ [ Syndic.Atom.author ~uri:(Uri.of_string (Source.url (Feed.source (Post.feed post))))
2525+ (Source.name (Feed.source (Post.feed post))) ]
2626+ in
2727+ let links =
2828+ match Post.link post with
2929+ | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
3030+ | None -> []
3131+ in
3232+ let id =
3333+ match Post.link post with
3434+ | Some l -> l
3535+ | None -> Uri.of_string (Digest.to_hex (Digest.string (Post.title post)))
3636+ in
3737+ let authors = (Syndic.Atom.author ~email:(Post.email post) (Post.author post), []) in
3838+ let title : Syndic.Atom.text_construct = Syndic.Atom.Text (Post.title post) in
3939+ let updated =
4040+ match Post.date post with
4141+ (* Atom entry requires a date but RSS2 does not. So if a date
4242+ * is not available, just capture the current date. *)
4343+ | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
4444+ | Some d -> d
4545+ in
4646+ Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
4747+ ()
4848+4949+ let entries_of_posts posts = List.map entry_of_post posts
5050+5151+ let feed_of_entries ~title ?id ?(authors = []) entries =
5252+ let feed_id = match id with
5353+ | Some i -> Uri.of_string i
5454+ | None -> Uri.of_string "urn:river:merged"
5555+ in
5656+ let feed_authors = List.map (fun (name, email) ->
5757+ match email with
5858+ | Some e -> Syndic.Atom.author ~email:e name
5959+ | None -> Syndic.Atom.author name
6060+ ) authors in
6161+ {
6262+ Syndic.Atom.id = feed_id;
6363+ title = Syndic.Atom.Text title;
6464+ updated = Ptime.of_float_s (Unix.time ()) |> Option.get;
6565+ entries;
6666+ authors = feed_authors;
6767+ categories = [];
6868+ contributors = [];
6969+ generator = Some {
7070+ Syndic.Atom.version = Some "1.0";
7171+ uri = None;
7272+ content = "River Feed Aggregator";
7373+ };
7474+ icon = None;
7575+ links = [];
7676+ logo = None;
7777+ rights = None;
7878+ subtitle = None;
7979+ }
8080+8181+ let to_string feed =
8282+ let output = Buffer.create 4096 in
8383+ Syndic.Atom.output feed (`Buffer output);
8484+ Buffer.contents output
8585+end
8686+8787+module Rss2 = struct
8888+ let of_feed feed =
8989+ match Feed.content feed with
9090+ | Feed.Rss2 ch -> Some ch
9191+ | _ -> None
9292+end
9393+9494+module Jsonfeed = struct
9595+ let item_of_post post =
9696+ (* Convert HTML content back to string *)
9797+ let html = Post.content post in
9898+ let content = `Html html in
9999+100100+ (* Create author *)
101101+ let authors =
102102+ if Post.author post <> "" then
103103+ let author = Jsonfeed.Author.create ~name:(Post.author post) () in
104104+ Some [author]
105105+ else
106106+ None
107107+ in
108108+109109+ (* Create item *)
110110+ Jsonfeed.Item.create
111111+ ~id:(Post.id post)
112112+ ~content
113113+ ?url:(Option.map Uri.to_string (Post.link post))
114114+ ~title:(Post.title post)
115115+ ?summary:(Post.summary post)
116116+ ?date_published:(Post.date post)
117117+ ?authors
118118+ ~tags:(Post.tags post)
119119+ ()
120120+121121+ let items_of_posts posts = List.map item_of_post posts
122122+123123+ let feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items =
124124+ Jsonfeed.create ~title ?home_page_url ?feed_url ?description ?icon ?favicon ~items ()
125125+126126+ let feed_of_posts ~title ?home_page_url ?feed_url ?description ?icon ?favicon posts =
127127+ let items = items_of_posts posts in
128128+ feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items
129129+130130+ let to_string ?(minify = false) jsonfeed =
131131+ match Jsonfeed.to_string ~minify jsonfeed with
132132+ | Ok s -> Ok s
133133+ | Error err -> Error (Jsont.Error.to_string err)
134134+135135+ let of_feed feed =
136136+ match Feed.content feed with
137137+ | Feed.Json jf -> Some jf
138138+ | _ -> None
139139+end
+103
stack/river/lib/format.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed format conversion and export. *)
1919+2020+module Atom : sig
2121+ (** Atom 1.0 format support. *)
2222+2323+ val entry_of_post : Post.t -> Syndic.Atom.entry
2424+ (** [entry_of_post post] converts a post to an Atom entry. *)
2525+2626+ val entries_of_posts : Post.t list -> Syndic.Atom.entry list
2727+ (** [entries_of_posts posts] converts posts to Atom entries. *)
2828+2929+ val feed_of_entries :
3030+ title:string ->
3131+ ?id:string ->
3232+ ?authors:(string * string option) list ->
3333+ Syndic.Atom.entry list ->
3434+ Syndic.Atom.feed
3535+ (** [feed_of_entries ~title entries] creates an Atom feed from entries.
3636+3737+ @param title The feed title
3838+ @param id Optional feed ID (default: "urn:river:merged")
3939+ @param authors Optional list of (name, email) tuples *)
4040+4141+ val to_string : Syndic.Atom.feed -> string
4242+ (** [to_string feed] serializes an Atom feed to XML string. *)
4343+end
4444+4545+module Rss2 : sig
4646+ (** RSS 2.0 format support. *)
4747+4848+ val of_feed : Feed.t -> Syndic.Rss2.channel option
4949+ (** [of_feed feed] extracts RSS2 channel if the feed is RSS2 format.
5050+5151+ Returns None if the feed is not RSS2. *)
5252+end
5353+5454+module Jsonfeed : sig
5555+ (** JSON Feed 1.1 format support. *)
5656+5757+ val item_of_post : Post.t -> Jsonfeed.Item.t
5858+ (** [item_of_post post] converts a post to a JSONFeed item. *)
5959+6060+ val items_of_posts : Post.t list -> Jsonfeed.Item.t list
6161+ (** [items_of_posts posts] converts posts to JSONFeed items. *)
6262+6363+ val feed_of_items :
6464+ title:string ->
6565+ ?home_page_url:string ->
6666+ ?feed_url:string ->
6767+ ?description:string ->
6868+ ?icon:string ->
6969+ ?favicon:string ->
7070+ Jsonfeed.Item.t list ->
7171+ Jsonfeed.t
7272+ (** [feed_of_items ~title items] creates a JSONFeed from items.
7373+7474+ @param title The feed title (required)
7575+ @param home_page_url The URL of the website the feed represents
7676+ @param feed_url The URL of the feed itself
7777+ @param description A description of the feed
7878+ @param icon URL of an icon for the feed (512x512 recommended)
7979+ @param favicon URL of a favicon for the feed (64x64 recommended) *)
8080+8181+ val feed_of_posts :
8282+ title:string ->
8383+ ?home_page_url:string ->
8484+ ?feed_url:string ->
8585+ ?description:string ->
8686+ ?icon:string ->
8787+ ?favicon:string ->
8888+ Post.t list ->
8989+ Jsonfeed.t
9090+ (** [feed_of_posts ~title posts] creates a JSONFeed from posts.
9191+9292+ Convenience function that combines [items_of_posts] and [feed_of_items]. *)
9393+9494+ val to_string : ?minify:bool -> Jsonfeed.t -> (string, string) result
9595+ (** [to_string ?minify feed] serializes a JSONFeed to JSON string.
9696+9797+ @param minify If true, output compact JSON; if false, pretty-print (default: false) *)
9898+9999+ val of_feed : Feed.t -> Jsonfeed.t option
100100+ (** [of_feed feed] extracts JSONFeed if the feed is JSONFeed format.
101101+102102+ Returns None if the feed is not JSONFeed. *)
103103+end
+330
stack/river/lib/html_markdown.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Internal utility for HTML to Markdown conversion *)
1919+2020+[@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
2121+2222+(** HTML to Markdown converter using Lambda Soup *)
2323+2424+(** Extract all links from HTML content *)
2525+let extract_links html_str =
2626+ try
2727+ let soup = Soup.parse html_str in
2828+ let links = Soup.select "a[href]" soup in
2929+ Soup.fold (fun acc link ->
3030+ match Soup.attribute "href" link with
3131+ | Some href ->
3232+ let text = Soup.texts link |> String.concat "" |> String.trim in
3333+ (href, text) :: acc
3434+ | None -> acc
3535+ ) [] links
3636+ |> List.rev
3737+ with _ -> []
3838+3939+(** Check if string contains any whitespace *)
4040+let has_whitespace s =
4141+ try
4242+ let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
4343+ true
4444+ with Not_found -> false
4545+4646+(** Clean up excessive newlines and normalize spacing *)
4747+let cleanup_markdown s =
4848+ (* Normalize line endings *)
4949+ let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
5050+5151+ (* Remove trailing whitespace from each line *)
5252+ let lines = String.split_on_char '\n' s in
5353+ let lines = List.map (fun line ->
5454+ (* Trim trailing spaces but preserve leading spaces for indentation *)
5555+ let len = String.length line in
5656+ let rec find_last_non_space i =
5757+ if i < 0 then -1
5858+ else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
5959+ else i
6060+ in
6161+ let last = find_last_non_space (len - 1) in
6262+ if last < 0 then ""
6363+ else String.sub line 0 (last + 1)
6464+ ) lines in
6565+6666+ (* Join back and collapse excessive blank lines *)
6767+ let s = String.concat "\n" lines in
6868+6969+ (* Replace 3+ consecutive newlines with just 2 *)
7070+ let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
7171+7272+ (* Trim leading and trailing whitespace *)
7373+ String.trim s
7474+7575+(** Convert HTML to Markdown using state-based whitespace handling *)
7676+let html_to_markdown html_str =
7777+ try
7878+ let soup = Soup.parse html_str in
7979+ let buffer = Buffer.create 256 in
8080+8181+ (* State: track if we need to insert a space before next text *)
8282+ let need_space = ref false in
8383+8484+ (* Get last character in buffer, if any *)
8585+ let last_char () =
8686+ let len = Buffer.length buffer in
8787+ if len = 0 then None
8888+ else Some (Buffer.nth buffer (len - 1))
8989+ in
9090+9191+ (* Add text with proper spacing *)
9292+ let add_text text =
9393+ let trimmed = String.trim text in
9494+ if trimmed <> "" then begin
9595+ (* Check if text starts with punctuation that shouldn't have space before it *)
9696+ let starts_with_punctuation =
9797+ String.length trimmed > 0 &&
9898+ (match trimmed.[0] with
9999+ | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
100100+ | _ -> false)
101101+ in
102102+103103+ (* Add space if needed, unless we're before punctuation *)
104104+ if !need_space && not starts_with_punctuation then begin
105105+ match last_char () with
106106+ | Some (' ' | '\n') -> ()
107107+ | _ -> Buffer.add_char buffer ' '
108108+ end;
109109+ Buffer.add_string buffer trimmed;
110110+ need_space := false
111111+ end
112112+ in
113113+114114+ (* Mark that we need space before next text (for inline elements) *)
115115+ let mark_space_needed () =
116116+ need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
117117+ in
118118+119119+ (* Process header with ID/anchor handling *)
120120+ let process_header level elem =
121121+ need_space := false;
122122+123123+ (* Check if header contains a link with an ID fragment *)
124124+ let link_opt = Soup.select_one "a[href]" elem in
125125+ let anchor_id = match link_opt with
126126+ | Some link ->
127127+ (match Soup.attribute "href" link with
128128+ | Some href ->
129129+ (* Extract fragment from URL *)
130130+ let uri = Uri.of_string href in
131131+ Uri.fragment uri
132132+ | None -> None)
133133+ | None -> None
134134+ in
135135+136136+ (* Add anchor if we found an ID *)
137137+ (match anchor_id with
138138+ | Some id when id <> "" ->
139139+ Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
140140+ | _ -> ());
141141+142142+ (* Add the header marker *)
143143+ let marker = String.make level '#' in
144144+ Buffer.add_string buffer ("\n" ^ marker ^ " ");
145145+146146+ (* Get text content, excluding link tags *)
147147+ let text = Soup.texts elem |> String.concat " " |> String.trim in
148148+ Buffer.add_string buffer text;
149149+150150+ Buffer.add_string buffer "\n\n";
151151+ need_space := false
152152+ in
153153+154154+ let rec process_node node =
155155+ match Soup.element node with
156156+ | Some elem ->
157157+ let tag = Soup.name elem in
158158+ (match tag with
159159+ (* Block elements - reset space tracking *)
160160+ | "h1" -> process_header 1 elem
161161+ | "h2" -> process_header 2 elem
162162+ | "h3" -> process_header 3 elem
163163+ | "h4" -> process_header 4 elem
164164+ | "h5" -> process_header 5 elem
165165+ | "h6" -> process_header 6 elem
166166+ | "p" ->
167167+ need_space := false;
168168+ Soup.children elem |> Soup.iter process_node;
169169+ Buffer.add_string buffer "\n\n";
170170+ need_space := false
171171+ | "br" ->
172172+ Buffer.add_string buffer "\n";
173173+ need_space := false
174174+ (* Inline elements - preserve space tracking *)
175175+ | "strong" | "b" ->
176176+ (* Add space before if needed *)
177177+ if !need_space then begin
178178+ match last_char () with
179179+ | Some (' ' | '\n') -> ()
180180+ | _ -> Buffer.add_char buffer ' '
181181+ end;
182182+ Buffer.add_string buffer "**";
183183+ need_space := false;
184184+ Soup.children elem |> Soup.iter process_node;
185185+ Buffer.add_string buffer "**";
186186+ mark_space_needed ()
187187+ | "em" | "i" ->
188188+ (* Add space before if needed *)
189189+ if !need_space then begin
190190+ match last_char () with
191191+ | Some (' ' | '\n') -> ()
192192+ | _ -> Buffer.add_char buffer ' '
193193+ end;
194194+ Buffer.add_string buffer "*";
195195+ need_space := false;
196196+ Soup.children elem |> Soup.iter process_node;
197197+ Buffer.add_string buffer "*";
198198+ mark_space_needed ()
199199+ | "code" ->
200200+ (* Add space before if needed *)
201201+ if !need_space then begin
202202+ match last_char () with
203203+ | Some (' ' | '\n') -> ()
204204+ | _ -> Buffer.add_char buffer ' '
205205+ end;
206206+ Buffer.add_string buffer "`";
207207+ need_space := false;
208208+ Soup.children elem |> Soup.iter process_node;
209209+ Buffer.add_string buffer "`";
210210+ mark_space_needed ()
211211+ | "pre" ->
212212+ need_space := false;
213213+ Buffer.add_string buffer "\n```\n";
214214+ Soup.children elem |> Soup.iter process_node;
215215+ Buffer.add_string buffer "\n```\n\n";
216216+ need_space := false
217217+ | "a" ->
218218+ let text = Soup.texts elem |> String.concat " " |> String.trim in
219219+ let href = Soup.attribute "href" elem in
220220+ (match href with
221221+ | Some href ->
222222+ (* Add space before link if needed *)
223223+ if !need_space then begin
224224+ match last_char () with
225225+ | Some (' ' | '\n') -> ()
226226+ | _ -> Buffer.add_char buffer ' '
227227+ end;
228228+ need_space := false;
229229+230230+ (* Add the link markdown *)
231231+ if text = "" then
232232+ Buffer.add_string buffer (Printf.sprintf "<%s>" href)
233233+ else
234234+ Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
235235+236236+ (* Mark that space may be needed after link *)
237237+ mark_space_needed ()
238238+ | None ->
239239+ add_text text)
240240+ | "ul" | "ol" ->
241241+ need_space := false;
242242+ Buffer.add_string buffer "\n";
243243+ let is_ordered = tag = "ol" in
244244+ let items = Soup.children elem |> Soup.to_list in
245245+ List.iteri (fun i item ->
246246+ match Soup.element item with
247247+ | Some li when Soup.name li = "li" ->
248248+ need_space := false;
249249+ if is_ordered then
250250+ Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
251251+ else
252252+ Buffer.add_string buffer "- ";
253253+ Soup.children li |> Soup.iter process_node;
254254+ Buffer.add_string buffer "\n"
255255+ | _ -> ()
256256+ ) items;
257257+ Buffer.add_string buffer "\n";
258258+ need_space := false
259259+ | "blockquote" ->
260260+ need_space := false;
261261+ Buffer.add_string buffer "\n> ";
262262+ Soup.children elem |> Soup.iter process_node;
263263+ Buffer.add_string buffer "\n\n";
264264+ need_space := false
265265+ | "img" ->
266266+ (* Add space before if needed *)
267267+ if !need_space then begin
268268+ match last_char () with
269269+ | Some (' ' | '\n') -> ()
270270+ | _ -> Buffer.add_char buffer ' '
271271+ end;
272272+ let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
273273+ let src = Soup.attribute "src" elem |> Option.value ~default:"" in
274274+ Buffer.add_string buffer (Printf.sprintf "" alt src);
275275+ need_space := false;
276276+ mark_space_needed ()
277277+ | "hr" ->
278278+ need_space := false;
279279+ Buffer.add_string buffer "\n---\n\n";
280280+ need_space := false
281281+ (* Strip these tags but keep content *)
282282+ | "div" | "span" | "article" | "section" | "header" | "footer"
283283+ | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
284284+ Soup.children elem |> Soup.iter process_node
285285+ (* Ignore script, style, etc *)
286286+ | "script" | "style" | "noscript" -> ()
287287+ (* Default: just process children *)
288288+ | _ ->
289289+ Soup.children elem |> Soup.iter process_node)
290290+ | None ->
291291+ (* Text node - handle whitespace properly *)
292292+ match Soup.leaf_text node with
293293+ | Some text ->
294294+ (* If text is only whitespace, mark that we need space *)
295295+ let trimmed = String.trim text in
296296+ if trimmed = "" then begin
297297+ if has_whitespace text then
298298+ need_space := true
299299+ end else begin
300300+ (* Text has content - check if it had leading/trailing whitespace *)
301301+ let had_leading_ws = has_whitespace text &&
302302+ (String.length text > 0 &&
303303+ (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
304304+305305+ (* If had leading whitespace, mark we need space *)
306306+ if had_leading_ws then need_space := true;
307307+308308+ (* Add the text content *)
309309+ add_text trimmed;
310310+311311+ (* If had trailing whitespace, mark we need space for next *)
312312+ let had_trailing_ws = has_whitespace text &&
313313+ (String.length text > 0 &&
314314+ let last = text.[String.length text - 1] in
315315+ last = ' ' || last = '\t' || last = '\n' || last = '\r') in
316316+ if had_trailing_ws then need_space := true
317317+ end
318318+ | None -> ()
319319+ in
320320+321321+ Soup.children soup |> Soup.iter process_node;
322322+323323+ (* Clean up the result *)
324324+ let result = Buffer.contents buffer in
325325+ cleanup_markdown result
326326+ with _ -> html_str
327327+328328+(** Convert HTML content to clean Markdown *)
329329+let to_markdown html_str =
330330+ html_to_markdown html_str
+84
stack/river/lib/html_meta.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Internal utility for HTML meta tag extraction *)
1919+2020+[@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
2121+2222+(** This module determines an image to be used as preview of a website.
2323+2424+ It does this by following the same logic Google+ and other websites use, and
2525+ described in this article:
2626+ https://www.raymondcamden.com/2011/07/26/How-are-Facebook-and-Google-creating-link-previews *)
2727+2828+let og_image html =
2929+ let open Soup in
3030+ let soup = parse html in
3131+ try soup $ "meta[property=og:image]" |> R.attribute "content" |> Option.some
3232+ with Failure _ -> None
3333+3434+let image_src html =
3535+ let open Soup in
3636+ let soup = parse html in
3737+ try soup $ "link[rel=\"image_src\"]" |> R.attribute "href" |> Option.some
3838+ with Failure _ -> None
3939+4040+let twitter_image html =
4141+ let open Soup in
4242+ let soup = parse html in
4343+ try
4444+ soup $ "meta[name=\"twitter:image\"]" |> R.attribute "content"
4545+ |> Option.some
4646+ with Failure _ -> None
4747+4848+let og_description html =
4949+ let open Soup in
5050+ let soup = parse html in
5151+ try
5252+ soup $ "meta[property=og:description]" |> R.attribute "content"
5353+ |> Option.some
5454+ with Failure _ -> None
5555+5656+let description html =
5757+ let open Soup in
5858+ let soup = parse html in
5959+ try
6060+ soup $ "meta[property=description]" |> R.attribute "content" |> Option.some
6161+ with Failure _ -> None
6262+6363+let preview_image html =
6464+ let preview_image =
6565+ match og_image html with
6666+ | None -> (
6767+ match image_src html with
6868+ | None -> twitter_image html
6969+ | Some x -> Some x)
7070+ | Some x -> Some x
7171+ in
7272+ match Option.map String.trim preview_image with
7373+ | Some "" -> None
7474+ | Some x -> Some x
7575+ | None -> None
7676+7777+let description html =
7878+ let preview_image =
7979+ match og_description html with None -> description html | Some x -> Some x
8080+ in
8181+ match Option.map String.trim preview_image with
8282+ | Some "" -> None
8383+ | Some x -> Some x
8484+ | None -> None
+393
stack/river/lib/post.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Post representation and extraction from feeds. *)
1919+2020+let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
2121+module Log = (val Logs.src_log src : Logs.LOG)
2222+2323+type t = {
2424+ id : string;
2525+ title : string;
2626+ link : Uri.t option;
2727+ date : Syndic.Date.t option;
2828+ feed : Feed.t;
2929+ author : string;
3030+ email : string;
3131+ content : Soup.soup Soup.node;
3232+ mutable link_response : (string, string) result option;
3333+ tags : string list;
3434+ summary : string option;
3535+}
3636+3737+(** Generate a stable, unique ID from available data *)
3838+let generate_id ?guid ?link ?title ?date ~feed_url () =
3939+ match guid with
4040+ | Some id when id <> "" ->
4141+ (* Use explicit ID/GUID if available *)
4242+ id
4343+ | _ ->
4444+ match link with
4545+ | Some uri when Uri.to_string uri <> "" ->
4646+ (* Use permalink as ID (stable and unique) *)
4747+ Uri.to_string uri
4848+ | _ ->
4949+ (* Fallback: hash of feed_url + title + date *)
5050+ let title_str = Option.value title ~default:"" in
5151+ let date_str =
5252+ match date with
5353+ | Some d -> Ptime.to_rfc3339 d
5454+ | None -> ""
5555+ in
5656+ let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in
5757+ (* Use SHA256 for stable hashing *)
5858+ Digest.string composite |> Digest.to_hex
5959+6060+let resolve_links_attr ~xmlbase attr el =
6161+ Soup.R.attribute attr el
6262+ |> Uri.of_string
6363+ |> Syndic.XML.resolve ~xmlbase
6464+ |> Uri.to_string
6565+ |> fun value -> Soup.set_attribute attr value el
6666+6767+(* Things that posts should not contain *)
6868+let undesired_tags = [ "style"; "script" ]
6969+let undesired_attr = [ "id" ]
7070+7171+let html_of_text ?xmlbase s =
7272+ let soup = Soup.parse s in
7373+ let ($$) = Soup.($$) in
7474+ soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
7575+ soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
7676+ undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
7777+ soup $$ "*" |> Soup.iter (fun el ->
7878+ undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
7979+ soup
8080+8181+(* Do not trust sites using XML for HTML content. Convert to string and parse
8282+ back. (Does not always fix bad HTML unfortunately.) *)
8383+let html_of_syndic =
8484+ let ns_prefix _ = Some "" in
8585+ fun ?xmlbase h ->
8686+ html_of_text ?xmlbase
8787+ (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))
8888+8989+let string_of_option = function None -> "" | Some s -> s
9090+9191+let post_compare p1 p2 =
9292+ (* Most recent posts first. Posts with no date are always last *)
9393+ match (p1.date, p2.date) with
9494+ | Some d1, Some d2 -> Syndic.Date.compare d2 d1
9595+ | None, Some _ -> 1
9696+ | Some _, None -> -1
9797+ | None, None -> 1
9898+9999+let rec remove n l =
100100+ if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl
101101+102102+let rec take n = function
103103+ | [] -> []
104104+ | e :: tl -> if n > 0 then e :: take (n - 1) tl else []
105105+106106+let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
107107+ Log.debug (fun m -> m "Processing Atom entry: %s"
108108+ (Text_extract.string_of_text_construct e.title));
109109+110110+ let link =
111111+ try
112112+ Some
113113+ (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
114114+ .href
115115+ with Not_found -> (
116116+ Log.debug (fun m -> m "No alternate link found, trying fallback");
117117+ match e.links with
118118+ | l :: _ -> Some l.href
119119+ | [] -> (
120120+ match Uri.scheme e.id with
121121+ | Some "http" -> Some e.id
122122+ | Some "https" -> Some e.id
123123+ | _ -> None))
124124+ in
125125+ let date =
126126+ match e.published with Some _ -> e.published | None -> Some e.updated
127127+ in
128128+ let content =
129129+ match e.content with
130130+ | Some (Text s) -> html_of_text s
131131+ | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
132132+ | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
133133+ | Some (Mime _) | Some (Src _) | None -> (
134134+ match e.summary with
135135+ | Some (Text s) -> html_of_text s
136136+ | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
137137+ | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
138138+ | None -> Soup.parse "")
139139+ in
140140+ let is_valid_author_name name =
141141+ (* Filter out empty strings and placeholder values like "Unknown" *)
142142+ let trimmed = String.trim name in
143143+ trimmed <> "" && trimmed <> "Unknown"
144144+ in
145145+ let author_name =
146146+ (* Fallback chain for author:
147147+ 1. Entry author (if present, not empty, and not "Unknown")
148148+ 2. Feed-level author (from Atom feed metadata)
149149+ 3. Feed title (from Atom feed metadata)
150150+ 4. Source name (manually entered feed name) *)
151151+ try
152152+ let author, _ = e.authors in
153153+ let trimmed = String.trim author.name in
154154+ if is_valid_author_name author.name then trimmed
155155+ else raise Not_found (* Try feed-level author *)
156156+ with Not_found -> (
157157+ match Feed.content feed with
158158+ | Feed.Atom atom_feed -> (
159159+ (* Try feed-level authors *)
160160+ match atom_feed.Syndic.Atom.authors with
161161+ | author :: _ when is_valid_author_name author.name ->
162162+ String.trim author.name
163163+ | _ ->
164164+ (* Use feed title *)
165165+ Text_extract.string_of_text_construct atom_feed.Syndic.Atom.title)
166166+ | Feed.Rss2 _ | Feed.Json _ ->
167167+ (* For RSS2 and JSONFeed, use the source name *)
168168+ Source.name (Feed.source feed))
169169+ in
170170+ (* Extract tags from Atom categories *)
171171+ let tags =
172172+ List.map (fun cat -> cat.Syndic.Atom.term) e.categories
173173+ in
174174+ (* Extract summary - convert from text_construct to string *)
175175+ let summary =
176176+ match e.summary with
177177+ | Some s -> Some (Text_extract.string_of_text_construct s)
178178+ | None -> None
179179+ in
180180+ (* Generate unique ID *)
181181+ let guid = Uri.to_string e.id in
182182+ let title_str = Text_extract.string_of_text_construct e.title in
183183+ let id =
184184+ generate_id ~guid ?link ~title:title_str ?date
185185+ ~feed_url:(Source.url (Feed.source feed)) ()
186186+ in
187187+ {
188188+ id;
189189+ title = title_str;
190190+ link;
191191+ date;
192192+ feed;
193193+ author = author_name;
194194+ email = "";
195195+ content;
196196+ link_response = None;
197197+ tags;
198198+ summary;
199199+ }
200200+201201+let post_of_rss2 ~(feed : Feed.t) it =
202202+ let title, content =
203203+ match it.Syndic.Rss2.story with
204204+ | All (t, xmlbase, d) -> (
205205+ ( t,
206206+ match it.content with
207207+ | _, "" -> html_of_text ?xmlbase d
208208+ | xmlbase, c -> html_of_text ?xmlbase c ))
209209+ | Title t ->
210210+ let xmlbase, c = it.content in
211211+ (t, html_of_text ?xmlbase c)
212212+ | Description (xmlbase, d) -> (
213213+ ( "",
214214+ match it.content with
215215+ | _, "" -> html_of_text ?xmlbase d
216216+ | xmlbase, c -> html_of_text ?xmlbase c ))
217217+ in
218218+ (* Note: it.link is of type Uri.t option in Syndic *)
219219+ let link =
220220+ match (it.guid, it.link) with
221221+ | Some u, _ when u.permalink -> Some u.data
222222+ | _, Some _ -> it.link
223223+ | Some u, _ ->
224224+ (* Sometimes the guid is indicated with isPermaLink="false" but is
225225+ nonetheless the only URL we get (e.g. ocamlpro). *)
226226+ Some u.data
227227+ | None, None -> None
228228+ in
229229+ (* Extract GUID string for ID generation *)
230230+ let guid_str =
231231+ match it.guid with
232232+ | Some u -> Some (Uri.to_string u.data)
233233+ | None -> None
234234+ in
235235+ (* RSS2 doesn't have a categories field exposed, use empty list *)
236236+ let tags = [] in
237237+ (* RSS2 doesn't have a separate summary field, so leave it empty *)
238238+ let summary = None in
239239+ (* Generate unique ID *)
240240+ let id =
241241+ generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
242242+ ~feed_url:(Source.url (Feed.source feed)) ()
243243+ in
244244+ {
245245+ id;
246246+ title;
247247+ link;
248248+ feed;
249249+ author = Source.name (Feed.source feed);
250250+ email = string_of_option it.author;
251251+ content;
252252+ date = it.pubDate;
253253+ link_response = None;
254254+ tags;
255255+ summary;
256256+ }
257257+258258+let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
259259+ Log.debug (fun m -> m "Processing JSONFeed item: %s"
260260+ (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));
261261+262262+ (* Extract content - prefer HTML, fall back to text *)
263263+ let content =
264264+ match Jsonfeed.Item.content item with
265265+ | `Html html -> html_of_text html
266266+ | `Text text -> html_of_text text
267267+ | `Both (html, _text) -> html_of_text html
268268+ in
269269+270270+ (* Extract author - use first author if multiple *)
271271+ let author_name, author_email =
272272+ match Jsonfeed.Item.authors item with
273273+ | Some (first :: _) ->
274274+ let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
275275+ (* JSONFeed authors don't typically have email *)
276276+ (name, "")
277277+ | _ ->
278278+ (* Fall back to feed-level authors or feed title *)
279279+ (match Feed.content feed with
280280+ | Feed.Json jsonfeed ->
281281+ (match Jsonfeed.authors jsonfeed with
282282+ | Some (first :: _) ->
283283+ let name = Jsonfeed.Author.name first |> Option.value ~default:(Feed.title feed) in
284284+ (name, "")
285285+ | _ -> (Feed.title feed, ""))
286286+ | _ -> (Feed.title feed, ""))
287287+ in
288288+289289+ (* Link - use url field *)
290290+ let link =
291291+ Jsonfeed.Item.url item
292292+ |> Option.map Uri.of_string
293293+ in
294294+295295+ (* Date *)
296296+ let date = Jsonfeed.Item.date_published item in
297297+298298+ (* Summary *)
299299+ let summary = Jsonfeed.Item.summary item in
300300+301301+ (* Tags *)
302302+ let tags =
303303+ Jsonfeed.Item.tags item
304304+ |> Option.value ~default:[]
305305+ in
306306+307307+ (* Generate unique ID - JSONFeed items always have an id field (required) *)
308308+ let guid = Jsonfeed.Item.id item in
309309+ let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in
310310+ let id =
311311+ generate_id ~guid ?link ~title:title_str ?date
312312+ ~feed_url:(Source.url (Feed.source feed)) ()
313313+ in
314314+315315+ {
316316+ id;
317317+ title = title_str;
318318+ link;
319319+ date;
320320+ feed;
321321+ author = author_name;
322322+ email = author_email;
323323+ content;
324324+ link_response = None;
325325+ tags;
326326+ summary;
327327+ }
328328+329329+let posts_of_feed c =
330330+ match Feed.content c with
331331+ | Feed.Atom f ->
332332+ let posts = List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries in
333333+ Log.debug (fun m -> m "Extracted %d posts from Atom feed '%s'"
334334+ (List.length posts) (Source.name (Feed.source c)));
335335+ posts
336336+ | Feed.Rss2 ch ->
337337+ let posts = List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items in
338338+ Log.debug (fun m -> m "Extracted %d posts from RSS2 feed '%s'"
339339+ (List.length posts) (Source.name (Feed.source c)));
340340+ posts
341341+ | Feed.Json jsonfeed ->
342342+ let items = Jsonfeed.items jsonfeed in
343343+ let posts = List.map (post_of_jsonfeed_item ~feed:c) items in
344344+ Log.debug (fun m -> m "Extracted %d posts from JSONFeed '%s'"
345345+ (List.length posts) (Source.name (Feed.source c)));
346346+ posts
347347+348348+let get_posts ?n ?(ofs = 0) planet_feeds =
349349+ Log.info (fun m -> m "Processing %d feeds for posts" (List.length planet_feeds));
350350+351351+ let posts = List.concat @@ List.map posts_of_feed planet_feeds in
352352+ Log.debug (fun m -> m "Total posts collected: %d" (List.length posts));
353353+354354+ let posts = List.sort post_compare posts in
355355+ Log.debug (fun m -> m "Posts sorted by date (most recent first)");
356356+357357+ let posts = remove ofs posts in
358358+ let result =
359359+ match n with
360360+ | None ->
361361+ Log.debug (fun m -> m "Returning all %d posts (offset=%d)"
362362+ (List.length posts) ofs);
363363+ posts
364364+ | Some n ->
365365+ let limited = take n posts in
366366+ Log.debug (fun m -> m "Returning %d posts (requested=%d, offset=%d)"
367367+ (List.length limited) n ofs);
368368+ limited
369369+ in
370370+ result
371371+372372+let of_feeds feeds = get_posts feeds
373373+374374+let feed t = t.feed
375375+let title t = t.title
376376+let link t = t.link
377377+let date t = t.date
378378+let author t = t.author
379379+let email t = t.email
380380+let content t = Soup.to_string t.content
381381+let id t = t.id
382382+let tags t = t.tags
383383+let summary t = t.summary
384384+385385+let meta_description _t =
386386+ (* TODO: This requires environment for HTTP access *)
387387+ Log.debug (fun m -> m "meta_description not implemented (requires environment)");
388388+ None
389389+390390+let seo_image _t =
391391+ (* TODO: This requires environment for HTTP access *)
392392+ Log.debug (fun m -> m "seo_image not implemented (requires environment)");
393393+ None
+68
stack/river/lib/post.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Post representation and extraction from feeds. *)
1919+2020+type t
2121+(** A post from a feed. *)
2222+2323+val of_feeds : Feed.t list -> t list
2424+(** [of_feeds feeds] extracts and deduplicates posts from the given feeds.
2525+2626+ Posts are deduplicated by ID. *)
2727+2828+val feed : t -> Feed.t
2929+(** [feed post] returns the feed this post originated from. *)
3030+3131+val title : t -> string
3232+(** [title post] returns the post title. *)
3333+3434+val link : t -> Uri.t option
3535+(** [link post] returns the post link. *)
3636+3737+val date : t -> Syndic.Date.t option
3838+(** [date post] returns the post date. *)
3939+4040+val author : t -> string
4141+(** [author post] returns the post author name. *)
4242+4343+val email : t -> string
4444+(** [email post] returns the post author email. *)
4545+4646+val content : t -> string
4747+(** [content post] returns the post content. *)
4848+4949+val id : t -> string
5050+(** [id post] returns the unique identifier of the post. *)
5151+5252+val tags : t -> string list
5353+(** [tags post] returns the list of tags associated with the post. *)
5454+5555+val summary : t -> string option
5656+(** [summary post] returns the summary/excerpt of the post, if available. *)
5757+5858+val meta_description : t -> string option
5959+(** [meta_description post] returns the meta description from the origin site.
6060+6161+ To get the meta description, we fetch the content of [link post] and look
6262+ for an HTML meta tag with name "description" or "og:description". *)
6363+6464+val seo_image : t -> string option
6565+(** [seo_image post] returns the social media image URL.
6666+6767+ To get the SEO image, we fetch the content of [link post] and look for an
6868+ HTML meta tag with name "og:image" or "twitter:image". *)
+192
stack/river/lib/quality.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed quality analysis. *)
1919+2020+type t = {
2121+ total_entries : int;
2222+ entries_with_summary : int;
2323+ entries_with_author : int;
2424+ entries_with_date : int;
2525+ entries_with_content : int;
2626+ entries_with_tags : int;
2727+ avg_content_length : float;
2828+ min_content_length : int;
2929+ max_content_length : int;
3030+ posting_frequency_days : float option;
3131+ quality_score : float;
3232+}
3333+3434+let make ~total_entries ~entries_with_summary ~entries_with_author
3535+ ~entries_with_date ~entries_with_content ~entries_with_tags
3636+ ~avg_content_length ~min_content_length ~max_content_length
3737+ ~posting_frequency_days ~quality_score =
3838+ {
3939+ total_entries;
4040+ entries_with_summary;
4141+ entries_with_author;
4242+ entries_with_date;
4343+ entries_with_content;
4444+ entries_with_tags;
4545+ avg_content_length;
4646+ min_content_length;
4747+ max_content_length;
4848+ posting_frequency_days;
4949+ quality_score;
5050+ }
5151+5252+let total_entries t = t.total_entries
5353+let entries_with_summary t = t.entries_with_summary
5454+let entries_with_author t = t.entries_with_author
5555+let entries_with_date t = t.entries_with_date
5656+let entries_with_content t = t.entries_with_content
5757+let entries_with_tags t = t.entries_with_tags
5858+let avg_content_length t = t.avg_content_length
5959+let min_content_length t = t.min_content_length
6060+let max_content_length t = t.max_content_length
6161+let posting_frequency_days t = t.posting_frequency_days
6262+let quality_score t = t.quality_score
6363+6464+(** Get content length from an Atom entry *)
6565+let get_content_length (entry : Syndic.Atom.entry) =
6666+ match entry.content with
6767+ | Some (Syndic.Atom.Text s) -> String.length s
6868+ | Some (Syndic.Atom.Html (_, s)) -> String.length s
6969+ | Some (Syndic.Atom.Xhtml (_, _)) -> 0 (* Could calculate but complex *)
7070+ | Some (Syndic.Atom.Mime _) -> 0
7171+ | Some (Syndic.Atom.Src _) -> 0
7272+ | None -> (
7373+ match entry.summary with
7474+ | Some (Syndic.Atom.Text s) -> String.length s
7575+ | Some (Syndic.Atom.Html (_, s)) -> String.length s
7676+ | Some (Syndic.Atom.Xhtml (_, _)) -> 0
7777+ | None -> 0)
7878+7979+(** Check if entry has non-empty summary *)
8080+let has_summary (entry : Syndic.Atom.entry) =
8181+ match entry.summary with
8282+ | Some (Syndic.Atom.Text s) when String.trim s <> "" -> true
8383+ | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> true
8484+ | Some (Syndic.Atom.Xhtml (_, _)) -> true
8585+ | _ -> false
8686+8787+(** Check if entry has author *)
8888+let has_author (entry : Syndic.Atom.entry) =
8989+ let (author, _) = entry.authors in
9090+ String.trim author.name <> ""
9191+9292+(** Check if entry has content *)
9393+let has_content (entry : Syndic.Atom.entry) =
9494+ get_content_length entry > 0
9595+9696+(** Check if entry has tags/categories *)
9797+let has_tags (entry : Syndic.Atom.entry) =
9898+ entry.categories <> []
9999+100100+(** Calculate quality score from metrics *)
101101+let calculate_quality_score t =
102102+ let total = float_of_int t.total_entries in
103103+ if total = 0.0 then 0.0
104104+ else
105105+ let summary_pct = float_of_int t.entries_with_summary /. total *. 100.0 in
106106+ let author_pct = float_of_int t.entries_with_author /. total *. 100.0 in
107107+ let date_pct = float_of_int t.entries_with_date /. total *. 100.0 in
108108+ let content_pct = float_of_int t.entries_with_content /. total *. 100.0 in
109109+ let tags_pct = float_of_int t.entries_with_tags /. total *. 100.0 in
110110+111111+ (* Weighted average: content and dates are most important *)
112112+ let score =
113113+ (content_pct *. 0.30) +.
114114+ (date_pct *. 0.25) +.
115115+ (author_pct *. 0.20) +.
116116+ (summary_pct *. 0.15) +.
117117+ (tags_pct *. 0.10)
118118+ in
119119+ score
120120+121121+let analyze entries =
122122+ if entries = [] then
123123+ failwith "No entries to analyze"
124124+ else
125125+ let total_entries = List.length entries in
126126+127127+ let entries_with_summary = ref 0 in
128128+ let entries_with_author = ref 0 in
129129+ let entries_with_date = ref total_entries in (* All Atom entries have updated *)
130130+ let entries_with_content = ref 0 in
131131+ let entries_with_tags = ref 0 in
132132+ let content_lengths = ref [] in
133133+ let dates = ref [] in
134134+135135+ List.iter (fun (entry : Syndic.Atom.entry) ->
136136+ if has_summary entry then incr entries_with_summary;
137137+ if has_author entry then incr entries_with_author;
138138+ if has_content entry then begin
139139+ incr entries_with_content;
140140+ content_lengths := get_content_length entry :: !content_lengths
141141+ end;
142142+ if has_tags entry then incr entries_with_tags;
143143+ dates := entry.updated :: !dates
144144+ ) entries;
145145+146146+ (* Calculate content statistics *)
147147+ let avg_content_length, min_content_length, max_content_length =
148148+ if !content_lengths = [] then
149149+ (0.0, 0, 0)
150150+ else
151151+ let sorted = List.sort compare !content_lengths in
152152+ let sum = List.fold_left (+) 0 sorted in
153153+ let avg = float_of_int sum /. float_of_int (List.length sorted) in
154154+ let min_len = List.hd sorted in
155155+ let max_len = List.hd (List.rev sorted) in
156156+ (avg, min_len, max_len)
157157+ in
158158+159159+ (* Calculate posting frequency *)
160160+ let posting_frequency_days =
161161+ if List.length !dates < 2 then
162162+ None
163163+ else
164164+ try
165165+ let timestamps = List.map Ptime.to_float_s !dates in
166166+ let sorted_timestamps = List.sort compare timestamps in
167167+ let first = List.hd sorted_timestamps in
168168+ let last = List.hd (List.rev sorted_timestamps) in
169169+ let total_days = (last -. first) /. 86400.0 in
170170+ let num_intervals = float_of_int (List.length sorted_timestamps - 1) in
171171+ Some (total_days /. num_intervals)
172172+ with _ -> None
173173+ in
174174+175175+ (* Create metrics record (without quality_score first) *)
176176+ let metrics = {
177177+ total_entries;
178178+ entries_with_summary = !entries_with_summary;
179179+ entries_with_author = !entries_with_author;
180180+ entries_with_date = !entries_with_date;
181181+ entries_with_content = !entries_with_content;
182182+ entries_with_tags = !entries_with_tags;
183183+ avg_content_length;
184184+ min_content_length;
185185+ max_content_length;
186186+ posting_frequency_days;
187187+ quality_score = 0.0; (* Placeholder *)
188188+ } in
189189+190190+ (* Calculate quality score *)
191191+ let quality_score = calculate_quality_score metrics in
192192+ { metrics with quality_score }
+57
stack/river/lib/quality.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed quality analysis. *)
1919+2020+type t
2121+(** Quality metrics for a feed or user's aggregated feed. *)
2222+2323+val make :
2424+ total_entries:int ->
2525+ entries_with_summary:int ->
2626+ entries_with_author:int ->
2727+ entries_with_date:int ->
2828+ entries_with_content:int ->
2929+ entries_with_tags:int ->
3030+ avg_content_length:float ->
3131+ min_content_length:int ->
3232+ max_content_length:int ->
3333+ posting_frequency_days:float option ->
3434+ quality_score:float ->
3535+ t
3636+(** [make ~total_entries ...] creates quality metrics. *)
3737+3838+val total_entries : t -> int
3939+val entries_with_summary : t -> int
4040+val entries_with_author : t -> int
4141+val entries_with_date : t -> int
4242+val entries_with_content : t -> int
4343+val entries_with_tags : t -> int
4444+val avg_content_length : t -> float
4545+val min_content_length : t -> int
4646+val max_content_length : t -> int
4747+val posting_frequency_days : t -> float option
4848+val quality_score : t -> float
4949+(** Accessors for quality metrics. *)
5050+5151+val analyze : Syndic.Atom.entry list -> t
5252+(** [analyze entries] computes quality metrics from Atom entries.
5353+5454+ The quality score is a weighted average of:
5555+ - Content completeness (40%)
5656+ - Metadata completeness (30%)
5757+ - Content richness (30%) *)
+9-1699
stack/river/lib/river.ml
···17171818(** River RSS/Atom/JSONFeed aggregator library *)
19192020-let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
2121-module Log = (val Logs.src_log src : Logs.LOG)
2222-2323-(** {1 Internal Utilities} *)
2424-2525-module Text_extract = struct
2626- open Syndic
2727-2828- (* Remove all tags *)
2929- let rec syndic_to_buffer b = function
3030- | XML.Node (_, _, subs) -> List.iter (syndic_to_buffer b) subs
3131- | XML.Data (_, d) -> Buffer.add_string b d
3232-3333- let syndic_to_string x =
3434- let b = Buffer.create 1024 in
3535- List.iter (syndic_to_buffer b) x;
3636- Buffer.contents b
3737-3838- let string_of_text_construct : Atom.text_construct -> string = function
3939- | Atom.Text s | Atom.Html (_, s) -> s
4040- | Atom.Xhtml (_, x) -> syndic_to_string x
4141-end
4242-4343-module Html_meta = struct
4444- [@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
4545-4646- (** This module determines an image to be used as preview of a website.
4747-4848- It does this by following the same logic Google+ and other websites use, and
4949- described in this article:
5050- https://www.raymondcamden.com/2011/07/26/How-are-Facebook-and-Google-creating-link-previews *)
5151-5252- let og_image html =
5353- let open Soup in
5454- let soup = parse html in
5555- try soup $ "meta[property=og:image]" |> R.attribute "content" |> Option.some
5656- with Failure _ -> None
5757-5858- let image_src html =
5959- let open Soup in
6060- let soup = parse html in
6161- try soup $ "link[rel=\"image_src\"]" |> R.attribute "href" |> Option.some
6262- with Failure _ -> None
6363-6464- let twitter_image html =
6565- let open Soup in
6666- let soup = parse html in
6767- try
6868- soup $ "meta[name=\"twitter:image\"]" |> R.attribute "content"
6969- |> Option.some
7070- with Failure _ -> None
7171-7272- let og_description html =
7373- let open Soup in
7474- let soup = parse html in
7575- try
7676- soup $ "meta[property=og:description]" |> R.attribute "content"
7777- |> Option.some
7878- with Failure _ -> None
7979-8080- let description html =
8181- let open Soup in
8282- let soup = parse html in
8383- try
8484- soup $ "meta[property=description]" |> R.attribute "content" |> Option.some
8585- with Failure _ -> None
8686-8787- let preview_image html =
8888- let preview_image =
8989- match og_image html with
9090- | None -> (
9191- match image_src html with
9292- | None -> twitter_image html
9393- | Some x -> Some x)
9494- | Some x -> Some x
9595- in
9696- match Option.map String.trim preview_image with
9797- | Some "" -> None
9898- | Some x -> Some x
9999- | None -> None
100100-101101- let description html =
102102- let preview_image =
103103- match og_description html with None -> description html | Some x -> Some x
104104- in
105105- match Option.map String.trim preview_image with
106106- | Some "" -> None
107107- | Some x -> Some x
108108- | None -> None
109109-end
110110-111111-module Html_markdown = struct
112112- [@@@warning "-32"] (* Suppress unused value warnings for internal utilities *)
113113-114114- (** HTML to Markdown converter using Lambda Soup *)
115115-116116- (** Extract all links from HTML content *)
117117- let extract_links html_str =
118118- try
119119- let soup = Soup.parse html_str in
120120- let links = Soup.select "a[href]" soup in
121121- Soup.fold (fun acc link ->
122122- match Soup.attribute "href" link with
123123- | Some href ->
124124- let text = Soup.texts link |> String.concat "" |> String.trim in
125125- (href, text) :: acc
126126- | None -> acc
127127- ) [] links
128128- |> List.rev
129129- with _ -> []
130130-131131- (** Check if string contains any whitespace *)
132132- let has_whitespace s =
133133- try
134134- let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in
135135- true
136136- with Not_found -> false
137137-138138- (** Clean up excessive newlines and normalize spacing *)
139139- let cleanup_markdown s =
140140- (* Normalize line endings *)
141141- let s = Str.global_replace (Str.regexp "\r\n") "\n" s in
142142-143143- (* Remove trailing whitespace from each line *)
144144- let lines = String.split_on_char '\n' s in
145145- let lines = List.map (fun line ->
146146- (* Trim trailing spaces but preserve leading spaces for indentation *)
147147- let len = String.length line in
148148- let rec find_last_non_space i =
149149- if i < 0 then -1
150150- else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1)
151151- else i
152152- in
153153- let last = find_last_non_space (len - 1) in
154154- if last < 0 then ""
155155- else String.sub line 0 (last + 1)
156156- ) lines in
157157-158158- (* Join back and collapse excessive blank lines *)
159159- let s = String.concat "\n" lines in
160160-161161- (* Replace 3+ consecutive newlines with just 2 *)
162162- let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in
163163-164164- (* Trim leading and trailing whitespace *)
165165- String.trim s
166166-167167- (** Convert HTML to Markdown using state-based whitespace handling *)
168168- let html_to_markdown html_str =
169169- try
170170- let soup = Soup.parse html_str in
171171- let buffer = Buffer.create 256 in
172172-173173- (* State: track if we need to insert a space before next text *)
174174- let need_space = ref false in
175175-176176- (* Get last character in buffer, if any *)
177177- let last_char () =
178178- let len = Buffer.length buffer in
179179- if len = 0 then None
180180- else Some (Buffer.nth buffer (len - 1))
181181- in
182182-183183- (* Add text with proper spacing *)
184184- let add_text text =
185185- let trimmed = String.trim text in
186186- if trimmed <> "" then begin
187187- (* Check if text starts with punctuation that shouldn't have space before it *)
188188- let starts_with_punctuation =
189189- String.length trimmed > 0 &&
190190- (match trimmed.[0] with
191191- | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true
192192- | _ -> false)
193193- in
194194-195195- (* Add space if needed, unless we're before punctuation *)
196196- if !need_space && not starts_with_punctuation then begin
197197- match last_char () with
198198- | Some (' ' | '\n') -> ()
199199- | _ -> Buffer.add_char buffer ' '
200200- end;
201201- Buffer.add_string buffer trimmed;
202202- need_space := false
203203- end
204204- in
205205-206206- (* Mark that we need space before next text (for inline elements) *)
207207- let mark_space_needed () =
208208- need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0
209209- in
210210-211211- (* Process header with ID/anchor handling *)
212212- let process_header level elem =
213213- need_space := false;
214214-215215- (* Check if header contains a link with an ID fragment *)
216216- let link_opt = Soup.select_one "a[href]" elem in
217217- let anchor_id = match link_opt with
218218- | Some link ->
219219- (match Soup.attribute "href" link with
220220- | Some href ->
221221- (* Extract fragment from URL *)
222222- let uri = Uri.of_string href in
223223- Uri.fragment uri
224224- | None -> None)
225225- | None -> None
226226- in
227227-228228- (* Add anchor if we found an ID *)
229229- (match anchor_id with
230230- | Some id when id <> "" ->
231231- Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id)
232232- | _ -> ());
233233-234234- (* Add the header marker *)
235235- let marker = String.make level '#' in
236236- Buffer.add_string buffer ("\n" ^ marker ^ " ");
237237-238238- (* Get text content, excluding link tags *)
239239- let text = Soup.texts elem |> String.concat " " |> String.trim in
240240- Buffer.add_string buffer text;
241241-242242- Buffer.add_string buffer "\n\n";
243243- need_space := false
244244- in
245245-246246- let rec process_node node =
247247- match Soup.element node with
248248- | Some elem ->
249249- let tag = Soup.name elem in
250250- (match tag with
251251- (* Block elements - reset space tracking *)
252252- | "h1" -> process_header 1 elem
253253- | "h2" -> process_header 2 elem
254254- | "h3" -> process_header 3 elem
255255- | "h4" -> process_header 4 elem
256256- | "h5" -> process_header 5 elem
257257- | "h6" -> process_header 6 elem
258258- | "p" ->
259259- need_space := false;
260260- Soup.children elem |> Soup.iter process_node;
261261- Buffer.add_string buffer "\n\n";
262262- need_space := false
263263- | "br" ->
264264- Buffer.add_string buffer "\n";
265265- need_space := false
266266- (* Inline elements - preserve space tracking *)
267267- | "strong" | "b" ->
268268- (* Add space before if needed *)
269269- if !need_space then begin
270270- match last_char () with
271271- | Some (' ' | '\n') -> ()
272272- | _ -> Buffer.add_char buffer ' '
273273- end;
274274- Buffer.add_string buffer "**";
275275- need_space := false;
276276- Soup.children elem |> Soup.iter process_node;
277277- Buffer.add_string buffer "**";
278278- mark_space_needed ()
279279- | "em" | "i" ->
280280- (* Add space before if needed *)
281281- if !need_space then begin
282282- match last_char () with
283283- | Some (' ' | '\n') -> ()
284284- | _ -> Buffer.add_char buffer ' '
285285- end;
286286- Buffer.add_string buffer "*";
287287- need_space := false;
288288- Soup.children elem |> Soup.iter process_node;
289289- Buffer.add_string buffer "*";
290290- mark_space_needed ()
291291- | "code" ->
292292- (* Add space before if needed *)
293293- if !need_space then begin
294294- match last_char () with
295295- | Some (' ' | '\n') -> ()
296296- | _ -> Buffer.add_char buffer ' '
297297- end;
298298- Buffer.add_string buffer "`";
299299- need_space := false;
300300- Soup.children elem |> Soup.iter process_node;
301301- Buffer.add_string buffer "`";
302302- mark_space_needed ()
303303- | "pre" ->
304304- need_space := false;
305305- Buffer.add_string buffer "\n```\n";
306306- Soup.children elem |> Soup.iter process_node;
307307- Buffer.add_string buffer "\n```\n\n";
308308- need_space := false
309309- | "a" ->
310310- let text = Soup.texts elem |> String.concat " " |> String.trim in
311311- let href = Soup.attribute "href" elem in
312312- (match href with
313313- | Some href ->
314314- (* Add space before link if needed *)
315315- if !need_space then begin
316316- match last_char () with
317317- | Some (' ' | '\n') -> ()
318318- | _ -> Buffer.add_char buffer ' '
319319- end;
320320- need_space := false;
321321-322322- (* Add the link markdown *)
323323- if text = "" then
324324- Buffer.add_string buffer (Printf.sprintf "<%s>" href)
325325- else
326326- Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href);
327327-328328- (* Mark that space may be needed after link *)
329329- mark_space_needed ()
330330- | None ->
331331- add_text text)
332332- | "ul" | "ol" ->
333333- need_space := false;
334334- Buffer.add_string buffer "\n";
335335- let is_ordered = tag = "ol" in
336336- let items = Soup.children elem |> Soup.to_list in
337337- List.iteri (fun i item ->
338338- match Soup.element item with
339339- | Some li when Soup.name li = "li" ->
340340- need_space := false;
341341- if is_ordered then
342342- Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1))
343343- else
344344- Buffer.add_string buffer "- ";
345345- Soup.children li |> Soup.iter process_node;
346346- Buffer.add_string buffer "\n"
347347- | _ -> ()
348348- ) items;
349349- Buffer.add_string buffer "\n";
350350- need_space := false
351351- | "blockquote" ->
352352- need_space := false;
353353- Buffer.add_string buffer "\n> ";
354354- Soup.children elem |> Soup.iter process_node;
355355- Buffer.add_string buffer "\n\n";
356356- need_space := false
357357- | "img" ->
358358- (* Add space before if needed *)
359359- if !need_space then begin
360360- match last_char () with
361361- | Some (' ' | '\n') -> ()
362362- | _ -> Buffer.add_char buffer ' '
363363- end;
364364- let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in
365365- let src = Soup.attribute "src" elem |> Option.value ~default:"" in
366366- Buffer.add_string buffer (Printf.sprintf "" alt src);
367367- need_space := false;
368368- mark_space_needed ()
369369- | "hr" ->
370370- need_space := false;
371371- Buffer.add_string buffer "\n---\n\n";
372372- need_space := false
373373- (* Strip these tags but keep content *)
374374- | "div" | "span" | "article" | "section" | "header" | "footer"
375375- | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" ->
376376- Soup.children elem |> Soup.iter process_node
377377- (* Ignore script, style, etc *)
378378- | "script" | "style" | "noscript" -> ()
379379- (* Default: just process children *)
380380- | _ ->
381381- Soup.children elem |> Soup.iter process_node)
382382- | None ->
383383- (* Text node - handle whitespace properly *)
384384- match Soup.leaf_text node with
385385- | Some text ->
386386- (* If text is only whitespace, mark that we need space *)
387387- let trimmed = String.trim text in
388388- if trimmed = "" then begin
389389- if has_whitespace text then
390390- need_space := true
391391- end else begin
392392- (* Text has content - check if it had leading/trailing whitespace *)
393393- let had_leading_ws = has_whitespace text &&
394394- (String.length text > 0 &&
395395- (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in
396396-397397- (* If had leading whitespace, mark we need space *)
398398- if had_leading_ws then need_space := true;
399399-400400- (* Add the text content *)
401401- add_text trimmed;
402402-403403- (* If had trailing whitespace, mark we need space for next *)
404404- let had_trailing_ws = has_whitespace text &&
405405- (String.length text > 0 &&
406406- let last = text.[String.length text - 1] in
407407- last = ' ' || last = '\t' || last = '\n' || last = '\r') in
408408- if had_trailing_ws then need_space := true
409409- end
410410- | None -> ()
411411- in
412412-413413- Soup.children soup |> Soup.iter process_node;
414414-415415- (* Clean up the result *)
416416- let result = Buffer.contents buffer in
417417- cleanup_markdown result
418418- with _ -> html_str
419419-420420- (** Convert HTML content to clean Markdown *)
421421- let to_markdown html_str =
422422- html_to_markdown html_str
423423-end
424424-425425-(** {1 Feed Sources} *)
426426-427427-module Source = struct
428428- type t = {
429429- name : string;
430430- url : string;
431431- }
432432-433433- let make ~name ~url = { name; url }
434434-435435- let name t = t.name
436436- let url t = t.url
437437-438438- let jsont =
439439- let make name url = { name; url } in
440440- Jsont.Object.map ~kind:"Source" make
441441- |> Jsont.Object.mem "name" Jsont.string ~enc:(fun s -> s.name)
442442- |> Jsont.Object.mem "url" Jsont.string ~enc:(fun s -> s.url)
443443- |> Jsont.Object.finish
444444-end
445445-446446-(** {1 HTTP Session Management} *)
447447-448448-module Session = struct
449449- type t = {
450450- session : (float Eio.Time.clock_ty Eio.Resource.t,
451451- [`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t;
452452- }
453453-454454- let init ~sw env =
455455- Log.info (fun m -> m "Initializing River session");
456456- let session = Requests.create ~sw
457457- ~default_headers:(Requests.Headers.of_list [
458458- ("User-Agent", "OCaml-River/1.0");
459459- ])
460460- ~follow_redirects:true
461461- ~max_redirects:5
462462- ~verify_tls:true
463463- env
464464- in
465465- { session }
466466-467467- let with_session env f =
468468- Log.info (fun m -> m "Creating River session");
469469- Eio.Switch.run @@ fun sw ->
470470- let client = init ~sw env in
471471- f client
472472-473473- let get_requests_session t = t.session
474474-end
475475-476476-(** {1 Feeds and Posts} *)
477477-478478-module Feed = struct
479479- type feed_content =
480480- | Atom of Syndic.Atom.feed
481481- | Rss2 of Syndic.Rss2.channel
482482- | Json of Jsonfeed.t
483483-484484- type t = {
485485- source : Source.t;
486486- title : string;
487487- content : feed_content;
488488- }
489489-490490- let string_of_feed = function
491491- | Atom _ -> "Atom"
492492- | Rss2 _ -> "Rss2"
493493- | Json _ -> "JSONFeed"
494494-495495- let classify_feed ~xmlbase (body : string) =
496496- Log.debug (fun m -> m "Attempting to parse feed (%d bytes)" (String.length body));
497497-498498- (* Quick check - does it look like JSON? *)
499499- let looks_like_json =
500500- String.length body > 0 &&
501501- let first_char = String.get body 0 in
502502- first_char = '{' || first_char = '['
503503- in
504504-505505- if looks_like_json then (
506506- (* Try JSONFeed first *)
507507- Log.debug (fun m -> m "Body looks like JSON, trying JSONFeed parser");
508508- match Jsonfeed.of_string body with
509509- | Ok jsonfeed ->
510510- Log.debug (fun m -> m "Successfully parsed as JSONFeed");
511511- Json jsonfeed
512512- | Error err ->
513513- Log.debug (fun m -> m "Not a JSONFeed: %s" (Jsont.Error.to_string err));
514514- (* Fall through to XML parsing *)
515515- failwith "Not a valid JSONFeed"
516516- ) else (
517517- (* Try XML formats *)
518518- try
519519- let feed = Atom (Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in
520520- Log.debug (fun m -> m "Successfully parsed as Atom feed");
521521- feed
522522- with
523523- | Syndic.Atom.Error.Error (pos, msg) -> (
524524- Log.debug (fun m -> m "Not an Atom feed: %s at position (%d, %d)"
525525- msg (fst pos) (snd pos));
526526- try
527527- let feed = Rss2 (Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in
528528- Log.debug (fun m -> m "Successfully parsed as RSS2 feed");
529529- feed
530530- with Syndic.Rss2.Error.Error (pos, msg) ->
531531- Log.err (fun m -> m "Failed to parse as RSS2: %s at position (%d, %d)"
532532- msg (fst pos) (snd pos));
533533- failwith "Neither Atom nor RSS2 feed")
534534- | Not_found as e ->
535535- Log.err (fun m -> m "Not_found exception during Atom feed parsing");
536536- Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ()));
537537- raise e
538538- | e ->
539539- Log.err (fun m -> m "Unexpected exception during feed parsing: %s"
540540- (Printexc.to_string e));
541541- Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ()));
542542- raise e
543543- )
544544-545545- let fetch session source =
546546- Log.info (fun m -> m "Fetching feed: %s" (Source.name source));
547547-548548- let xmlbase = Uri.of_string (Source.url source) in
549549-550550- (* Use Requests_json_api.get_result for clean Result-based error handling *)
551551- let requests_session = Session.get_requests_session session in
552552- let response =
553553- match Requests_json_api.get_result requests_session (Source.url source) with
554554- | Ok body ->
555555- Log.info (fun m -> m "Successfully fetched %s (%d bytes)"
556556- (Source.url source) (String.length body));
557557- body
558558- | Error (status, msg) ->
559559- Log.err (fun m -> m "Failed to fetch feed '%s': HTTP %d - %s"
560560- (Source.name source) status msg);
561561- failwith (Printf.sprintf "HTTP %d: %s" status msg)
562562- in
563563-564564- let content = classify_feed ~xmlbase response in
565565- let title =
566566- match content with
567567- | Atom atom -> Text_extract.string_of_text_construct atom.Syndic.Atom.title
568568- | Rss2 ch -> ch.Syndic.Rss2.title
569569- | Json jsonfeed -> Jsonfeed.title jsonfeed
570570- in
571571-572572- Log.info (fun m -> m "Successfully fetched %s feed '%s' (title: '%s')"
573573- (string_of_feed content) (Source.name source) title);
574574-575575- { source; title; content }
576576-577577- let source t = t.source
578578-end
579579-580580-(** {1 Posts} *)
581581-582582-module Post = struct
583583- type t = {
584584- id : string;
585585- title : string;
586586- link : Uri.t option;
587587- date : Syndic.Date.t option;
588588- feed : Feed.t;
589589- author : string;
590590- email : string;
591591- content : Soup.soup Soup.node;
592592- mutable link_response : (string, string) result option;
593593- tags : string list;
594594- summary : string option;
595595- }
596596-597597- (** Generate a stable, unique ID from available data *)
598598- let generate_id ?guid ?link ?title ?date ~feed_url () =
599599- match guid with
600600- | Some id when id <> "" ->
601601- (* Use explicit ID/GUID if available *)
602602- id
603603- | _ ->
604604- match link with
605605- | Some uri when Uri.to_string uri <> "" ->
606606- (* Use permalink as ID (stable and unique) *)
607607- Uri.to_string uri
608608- | _ ->
609609- (* Fallback: hash of feed_url + title + date *)
610610- let title_str = Option.value title ~default:"" in
611611- let date_str =
612612- match date with
613613- | Some d -> Ptime.to_rfc3339 d
614614- | None -> ""
615615- in
616616- let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in
617617- (* Use SHA256 for stable hashing *)
618618- Digest.string composite |> Digest.to_hex
619619-620620- let resolve_links_attr ~xmlbase attr el =
621621- Soup.R.attribute attr el
622622- |> Uri.of_string
623623- |> Syndic.XML.resolve ~xmlbase
624624- |> Uri.to_string
625625- |> fun value -> Soup.set_attribute attr value el
626626-627627- (* Things that posts should not contain *)
628628- let undesired_tags = [ "style"; "script" ]
629629- let undesired_attr = [ "id" ]
630630-631631- let html_of_text ?xmlbase s =
632632- let soup = Soup.parse s in
633633- let ($$) = Soup.($$) in
634634- soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href");
635635- soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src");
636636- undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete);
637637- soup $$ "*" |> Soup.iter (fun el ->
638638- undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el));
639639- soup
640640-641641- (* Do not trust sites using XML for HTML content. Convert to string and parse
642642- back. (Does not always fix bad HTML unfortunately.) *)
643643- let html_of_syndic =
644644- let ns_prefix _ = Some "" in
645645- fun ?xmlbase h ->
646646- html_of_text ?xmlbase
647647- (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h))
648648-649649- let string_of_option = function None -> "" | Some s -> s
650650-651651- let post_compare p1 p2 =
652652- (* Most recent posts first. Posts with no date are always last *)
653653- match (p1.date, p2.date) with
654654- | Some d1, Some d2 -> Syndic.Date.compare d2 d1
655655- | None, Some _ -> 1
656656- | Some _, None -> -1
657657- | None, None -> 1
658658-659659- let rec remove n l =
660660- if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl
661661-662662- let rec take n = function
663663- | [] -> []
664664- | e :: tl -> if n > 0 then e :: take (n - 1) tl else []
665665-666666- let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) =
667667- Log.debug (fun m -> m "Processing Atom entry: %s"
668668- (Text_extract.string_of_text_construct e.title));
669669-670670- let link =
671671- try
672672- Some
673673- (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links)
674674- .href
675675- with Not_found -> (
676676- Log.debug (fun m -> m "No alternate link found, trying fallback");
677677- match e.links with
678678- | l :: _ -> Some l.href
679679- | [] -> (
680680- match Uri.scheme e.id with
681681- | Some "http" -> Some e.id
682682- | Some "https" -> Some e.id
683683- | _ -> None))
684684- in
685685- let date =
686686- match e.published with Some _ -> e.published | None -> Some e.updated
687687- in
688688- let content =
689689- match e.content with
690690- | Some (Text s) -> html_of_text s
691691- | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
692692- | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
693693- | Some (Mime _) | Some (Src _) | None -> (
694694- match e.summary with
695695- | Some (Text s) -> html_of_text s
696696- | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s
697697- | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h
698698- | None -> Soup.parse "")
699699- in
700700- let is_valid_author_name name =
701701- (* Filter out empty strings and placeholder values like "Unknown" *)
702702- let trimmed = String.trim name in
703703- trimmed <> "" && trimmed <> "Unknown"
704704- in
705705- let author_name =
706706- (* Fallback chain for author:
707707- 1. Entry author (if present, not empty, and not "Unknown")
708708- 2. Feed-level author (from Atom feed metadata)
709709- 3. Feed title (from Atom feed metadata)
710710- 4. Source name (manually entered feed name) *)
711711- try
712712- let author, _ = e.authors in
713713- let trimmed = String.trim author.name in
714714- if is_valid_author_name author.name then trimmed
715715- else raise Not_found (* Try feed-level author *)
716716- with Not_found -> (
717717- match feed.content with
718718- | Feed.Atom atom_feed -> (
719719- (* Try feed-level authors *)
720720- match atom_feed.Syndic.Atom.authors with
721721- | author :: _ when is_valid_author_name author.name ->
722722- String.trim author.name
723723- | _ ->
724724- (* Use feed title *)
725725- Text_extract.string_of_text_construct atom_feed.Syndic.Atom.title)
726726- | Feed.Rss2 _ | Feed.Json _ ->
727727- (* For RSS2 and JSONFeed, use the source name *)
728728- Source.name feed.source)
729729- in
730730- (* Extract tags from Atom categories *)
731731- let tags =
732732- List.map (fun cat -> cat.Syndic.Atom.term) e.categories
733733- in
734734- (* Extract summary - convert from text_construct to string *)
735735- let summary =
736736- match e.summary with
737737- | Some s -> Some (Text_extract.string_of_text_construct s)
738738- | None -> None
739739- in
740740- (* Generate unique ID *)
741741- let guid = Uri.to_string e.id in
742742- let title_str = Text_extract.string_of_text_construct e.title in
743743- let id =
744744- generate_id ~guid ?link ~title:title_str ?date
745745- ~feed_url:(Source.url feed.source) ()
746746- in
747747- {
748748- id;
749749- title = title_str;
750750- link;
751751- date;
752752- feed;
753753- author = author_name;
754754- email = "";
755755- content;
756756- link_response = None;
757757- tags;
758758- summary;
759759- }
760760-761761- let post_of_rss2 ~(feed : Feed.t) it =
762762- let title, content =
763763- match it.Syndic.Rss2.story with
764764- | All (t, xmlbase, d) -> (
765765- ( t,
766766- match it.content with
767767- | _, "" -> html_of_text ?xmlbase d
768768- | xmlbase, c -> html_of_text ?xmlbase c ))
769769- | Title t ->
770770- let xmlbase, c = it.content in
771771- (t, html_of_text ?xmlbase c)
772772- | Description (xmlbase, d) -> (
773773- ( "",
774774- match it.content with
775775- | _, "" -> html_of_text ?xmlbase d
776776- | xmlbase, c -> html_of_text ?xmlbase c ))
777777- in
778778- (* Note: it.link is of type Uri.t option in Syndic *)
779779- let link =
780780- match (it.guid, it.link) with
781781- | Some u, _ when u.permalink -> Some u.data
782782- | _, Some _ -> it.link
783783- | Some u, _ ->
784784- (* Sometimes the guid is indicated with isPermaLink="false" but is
785785- nonetheless the only URL we get (e.g. ocamlpro). *)
786786- Some u.data
787787- | None, None -> None
788788- in
789789- (* Extract GUID string for ID generation *)
790790- let guid_str =
791791- match it.guid with
792792- | Some u -> Some (Uri.to_string u.data)
793793- | None -> None
794794- in
795795- (* RSS2 doesn't have a categories field exposed, use empty list *)
796796- let tags = [] in
797797- (* RSS2 doesn't have a separate summary field, so leave it empty *)
798798- let summary = None in
799799- (* Generate unique ID *)
800800- let id =
801801- generate_id ?guid:guid_str ?link ~title ?date:it.pubDate
802802- ~feed_url:(Source.url feed.source) ()
803803- in
804804- {
805805- id;
806806- title;
807807- link;
808808- feed;
809809- author = Source.name feed.source;
810810- email = string_of_option it.author;
811811- content;
812812- date = it.pubDate;
813813- link_response = None;
814814- tags;
815815- summary;
816816- }
817817-818818- let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) =
819819- Log.debug (fun m -> m "Processing JSONFeed item: %s"
820820- (Option.value (Jsonfeed.Item.title item) ~default:"Untitled"));
821821-822822- (* Extract content - prefer HTML, fall back to text *)
823823- let content =
824824- match Jsonfeed.Item.content item with
825825- | `Html html -> html_of_text html
826826- | `Text text -> html_of_text text
827827- | `Both (html, _text) -> html_of_text html
828828- in
829829-830830- (* Extract author - use first author if multiple *)
831831- let author_name, author_email =
832832- match Jsonfeed.Item.authors item with
833833- | Some (first :: _) ->
834834- let name = Jsonfeed.Author.name first |> Option.value ~default:"" in
835835- (* JSONFeed authors don't typically have email *)
836836- (name, "")
837837- | _ ->
838838- (* Fall back to feed-level authors or feed title *)
839839- (match feed.content with
840840- | Feed.Json jsonfeed ->
841841- (match Jsonfeed.authors jsonfeed with
842842- | Some (first :: _) ->
843843- let name = Jsonfeed.Author.name first |> Option.value ~default:feed.title in
844844- (name, "")
845845- | _ -> (feed.title, ""))
846846- | _ -> (feed.title, ""))
847847- in
848848-849849- (* Link - use url field *)
850850- let link =
851851- Jsonfeed.Item.url item
852852- |> Option.map Uri.of_string
853853- in
854854-855855- (* Date *)
856856- let date = Jsonfeed.Item.date_published item in
857857-858858- (* Summary *)
859859- let summary = Jsonfeed.Item.summary item in
860860-861861- (* Tags *)
862862- let tags =
863863- Jsonfeed.Item.tags item
864864- |> Option.value ~default:[]
865865- in
866866-867867- (* Generate unique ID - JSONFeed items always have an id field (required) *)
868868- let guid = Jsonfeed.Item.id item in
869869- let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in
870870- let id =
871871- generate_id ~guid ?link ~title:title_str ?date
872872- ~feed_url:(Source.url feed.source) ()
873873- in
874874-875875- {
876876- id;
877877- title = title_str;
878878- link;
879879- date;
880880- feed;
881881- author = author_name;
882882- email = author_email;
883883- content;
884884- link_response = None;
885885- tags;
886886- summary;
887887- }
888888-889889- let posts_of_feed c =
890890- match c.Feed.content with
891891- | Feed.Atom f ->
892892- let posts = List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries in
893893- Log.debug (fun m -> m "Extracted %d posts from Atom feed '%s'"
894894- (List.length posts) (Source.name c.source));
895895- posts
896896- | Feed.Rss2 ch ->
897897- let posts = List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items in
898898- Log.debug (fun m -> m "Extracted %d posts from RSS2 feed '%s'"
899899- (List.length posts) (Source.name c.source));
900900- posts
901901- | Feed.Json jsonfeed ->
902902- let items = Jsonfeed.items jsonfeed in
903903- let posts = List.map (post_of_jsonfeed_item ~feed:c) items in
904904- Log.debug (fun m -> m "Extracted %d posts from JSONFeed '%s'"
905905- (List.length posts) (Source.name c.source));
906906- posts
907907-908908- let get_posts ?n ?(ofs = 0) planet_feeds =
909909- Log.info (fun m -> m "Processing %d feeds for posts" (List.length planet_feeds));
910910-911911- let posts = List.concat @@ List.map posts_of_feed planet_feeds in
912912- Log.debug (fun m -> m "Total posts collected: %d" (List.length posts));
913913-914914- let posts = List.sort post_compare posts in
915915- Log.debug (fun m -> m "Posts sorted by date (most recent first)");
916916-917917- let posts = remove ofs posts in
918918- let result =
919919- match n with
920920- | None ->
921921- Log.debug (fun m -> m "Returning all %d posts (offset=%d)"
922922- (List.length posts) ofs);
923923- posts
924924- | Some n ->
925925- let limited = take n posts in
926926- Log.debug (fun m -> m "Returning %d posts (requested=%d, offset=%d)"
927927- (List.length limited) n ofs);
928928- limited
929929- in
930930- result
931931-932932- let of_feeds feeds = get_posts feeds
933933-934934- let feed t = t.feed
935935- let title t = t.title
936936- let link t = t.link
937937- let date t = t.date
938938- let author t = t.author
939939- let email t = t.email
940940- let content t = Soup.to_string t.content
941941- let id t = t.id
942942- let tags t = t.tags
943943- let summary t = t.summary
944944-945945- let meta_description _t =
946946- (* TODO: This requires environment for HTTP access *)
947947- Log.debug (fun m -> m "meta_description not implemented (requires environment)");
948948- None
949949-950950- let seo_image _t =
951951- (* TODO: This requires environment for HTTP access *)
952952- Log.debug (fun m -> m "seo_image not implemented (requires environment)");
953953- None
954954-end
955955-956956-(** {1 Format Conversion and Export} *)
957957-958958-module Format = struct
959959- module Atom = struct
960960- let entry_of_post post =
961961- let content = Syndic.Atom.Html (None, Post.content post) in
962962- let contributors =
963963- [ Syndic.Atom.author ~uri:(Uri.of_string (Source.url (Feed.source (Post.feed post))))
964964- (Source.name (Feed.source (Post.feed post))) ]
965965- in
966966- let links =
967967- match Post.link post with
968968- | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ]
969969- | None -> []
970970- in
971971- let id =
972972- match Post.link post with
973973- | Some l -> l
974974- | None -> Uri.of_string (Digest.to_hex (Digest.string (Post.title post)))
975975- in
976976- let authors = (Syndic.Atom.author ~email:(Post.email post) (Post.author post), []) in
977977- let title : Syndic.Atom.text_construct = Syndic.Atom.Text (Post.title post) in
978978- let updated =
979979- match Post.date post with
980980- (* Atom entry requires a date but RSS2 does not. So if a date
981981- * is not available, just capture the current date. *)
982982- | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get
983983- | Some d -> d
984984- in
985985- Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated
986986- ()
987987-988988- let entries_of_posts posts = List.map entry_of_post posts
989989-990990- let feed_of_entries ~title ?id ?(authors = []) entries =
991991- let feed_id = match id with
992992- | Some i -> Uri.of_string i
993993- | None -> Uri.of_string "urn:river:merged"
994994- in
995995- let feed_authors = List.map (fun (name, email) ->
996996- match email with
997997- | Some e -> Syndic.Atom.author ~email:e name
998998- | None -> Syndic.Atom.author name
999999- ) authors in
10001000- {
10011001- Syndic.Atom.id = feed_id;
10021002- title = Syndic.Atom.Text title;
10031003- updated = Ptime.of_float_s (Unix.time ()) |> Option.get;
10041004- entries;
10051005- authors = feed_authors;
10061006- categories = [];
10071007- contributors = [];
10081008- generator = Some {
10091009- Syndic.Atom.version = Some "1.0";
10101010- uri = None;
10111011- content = "River Feed Aggregator";
10121012- };
10131013- icon = None;
10141014- links = [];
10151015- logo = None;
10161016- rights = None;
10171017- subtitle = None;
10181018- }
10191019-10201020- let to_string feed =
10211021- let output = Buffer.create 4096 in
10221022- Syndic.Atom.output feed (`Buffer output);
10231023- Buffer.contents output
10241024- end
10251025-10261026- module Rss2 = struct
10271027- let of_feed feed =
10281028- match feed.Feed.content with
10291029- | Feed.Rss2 ch -> Some ch
10301030- | _ -> None
10311031- end
10321032-10331033- module Jsonfeed = struct
10341034- let item_of_post post =
10351035- (* Convert HTML content back to string *)
10361036- let html = Post.content post in
10371037- let content = `Html html in
10381038-10391039- (* Create author *)
10401040- let authors =
10411041- if Post.author post <> "" then
10421042- let author = Jsonfeed.Author.create ~name:(Post.author post) () in
10431043- Some [author]
10441044- else
10451045- None
10461046- in
10471047-10481048- (* Create item *)
10491049- Jsonfeed.Item.create
10501050- ~id:(Post.id post)
10511051- ~content
10521052- ?url:(Option.map Uri.to_string (Post.link post))
10531053- ~title:(Post.title post)
10541054- ?summary:(Post.summary post)
10551055- ?date_published:(Post.date post)
10561056- ?authors
10571057- ~tags:(Post.tags post)
10581058- ()
10591059-10601060- let items_of_posts posts = List.map item_of_post posts
10611061-10621062- let feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items =
10631063- Jsonfeed.create ~title ?home_page_url ?feed_url ?description ?icon ?favicon ~items ()
10641064-10651065- let feed_of_posts ~title ?home_page_url ?feed_url ?description ?icon ?favicon posts =
10661066- let items = items_of_posts posts in
10671067- feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items
10681068-10691069- let to_string ?(minify = false) jsonfeed =
10701070- match Jsonfeed.to_string ~minify jsonfeed with
10711071- | Ok s -> Ok s
10721072- | Error err -> Error (Jsont.Error.to_string err)
10731073-10741074- let of_feed feed =
10751075- match feed.Feed.content with
10761076- | Feed.Json jf -> Some jf
10771077- | _ -> None
10781078- end
10791079-end
10801080-10811081-(** {1 User Management} *)
10821082-10831083-module User = struct
10841084- type t = {
10851085- username : string;
10861086- fullname : string;
10871087- email : string option;
10881088- feeds : Source.t list;
10891089- last_synced : string option;
10901090- }
10911091-10921092- let make ~username ~fullname ?email ?(feeds = []) ?last_synced () =
10931093- { username; fullname; email; feeds; last_synced }
10941094-10951095- let username t = t.username
10961096- let fullname t = t.fullname
10971097- let email t = t.email
10981098- let feeds t = t.feeds
10991099- let last_synced t = t.last_synced
11001100-11011101- let add_feed t source =
11021102- { t with feeds = source :: t.feeds }
11031103-11041104- let remove_feed t ~url =
11051105- let feeds = List.filter (fun s -> Source.url s <> url) t.feeds in
11061106- { t with feeds }
11071107-11081108- let set_last_synced t timestamp =
11091109- { t with last_synced = Some timestamp }
11101110-11111111- let jsont =
11121112- let make username fullname email feeds last_synced =
11131113- { username; fullname; email; feeds; last_synced }
11141114- in
11151115- Jsont.Object.map ~kind:"User" make
11161116- |> Jsont.Object.mem "username" Jsont.string ~enc:(fun u -> u.username)
11171117- |> Jsont.Object.mem "fullname" Jsont.string ~enc:(fun u -> u.fullname)
11181118- |> Jsont.Object.opt_mem "email" Jsont.string ~enc:(fun u -> u.email)
11191119- |> Jsont.Object.mem "feeds" (Jsont.list Source.jsont) ~enc:(fun u -> u.feeds)
11201120- |> Jsont.Object.opt_mem "last_synced" Jsont.string ~enc:(fun u -> u.last_synced)
11211121- |> Jsont.Object.finish
11221122-end
11231123-11241124-(** {1 Feed Quality Analysis} *)
11251125-11261126-module Quality = struct
11271127- type t = {
11281128- total_entries : int;
11291129- entries_with_summary : int;
11301130- entries_with_author : int;
11311131- entries_with_date : int;
11321132- entries_with_content : int;
11331133- entries_with_tags : int;
11341134- avg_content_length : float;
11351135- min_content_length : int;
11361136- max_content_length : int;
11371137- posting_frequency_days : float option;
11381138- quality_score : float;
11391139- }
11401140-11411141- let make ~total_entries ~entries_with_summary ~entries_with_author
11421142- ~entries_with_date ~entries_with_content ~entries_with_tags
11431143- ~avg_content_length ~min_content_length ~max_content_length
11441144- ~posting_frequency_days ~quality_score =
11451145- {
11461146- total_entries;
11471147- entries_with_summary;
11481148- entries_with_author;
11491149- entries_with_date;
11501150- entries_with_content;
11511151- entries_with_tags;
11521152- avg_content_length;
11531153- min_content_length;
11541154- max_content_length;
11551155- posting_frequency_days;
11561156- quality_score;
11571157- }
11581158-11591159- let total_entries t = t.total_entries
11601160- let entries_with_summary t = t.entries_with_summary
11611161- let entries_with_author t = t.entries_with_author
11621162- let entries_with_date t = t.entries_with_date
11631163- let entries_with_content t = t.entries_with_content
11641164- let entries_with_tags t = t.entries_with_tags
11651165- let avg_content_length t = t.avg_content_length
11661166- let min_content_length t = t.min_content_length
11671167- let max_content_length t = t.max_content_length
11681168- let posting_frequency_days t = t.posting_frequency_days
11691169- let quality_score t = t.quality_score
11701170-11711171- (** Get content length from an Atom entry *)
11721172- let get_content_length (entry : Syndic.Atom.entry) =
11731173- match entry.content with
11741174- | Some (Syndic.Atom.Text s) -> String.length s
11751175- | Some (Syndic.Atom.Html (_, s)) -> String.length s
11761176- | Some (Syndic.Atom.Xhtml (_, _)) -> 0 (* Could calculate but complex *)
11771177- | Some (Syndic.Atom.Mime _) -> 0
11781178- | Some (Syndic.Atom.Src _) -> 0
11791179- | None -> (
11801180- match entry.summary with
11811181- | Some (Syndic.Atom.Text s) -> String.length s
11821182- | Some (Syndic.Atom.Html (_, s)) -> String.length s
11831183- | Some (Syndic.Atom.Xhtml (_, _)) -> 0
11841184- | None -> 0)
11851185-11861186- (** Check if entry has non-empty summary *)
11871187- let has_summary (entry : Syndic.Atom.entry) =
11881188- match entry.summary with
11891189- | Some (Syndic.Atom.Text s) when String.trim s <> "" -> true
11901190- | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> true
11911191- | Some (Syndic.Atom.Xhtml (_, _)) -> true
11921192- | _ -> false
11931193-11941194- (** Check if entry has author *)
11951195- let has_author (entry : Syndic.Atom.entry) =
11961196- let (author, _) = entry.authors in
11971197- String.trim author.name <> ""
11981198-11991199- (** Check if entry has content *)
12001200- let has_content (entry : Syndic.Atom.entry) =
12011201- get_content_length entry > 0
12021202-12031203- (** Check if entry has tags/categories *)
12041204- let has_tags (entry : Syndic.Atom.entry) =
12051205- entry.categories <> []
12061206-12071207- (** Calculate quality score from metrics *)
12081208- let calculate_quality_score t =
12091209- let total = float_of_int t.total_entries in
12101210- if total = 0.0 then 0.0
12111211- else
12121212- let summary_pct = float_of_int t.entries_with_summary /. total *. 100.0 in
12131213- let author_pct = float_of_int t.entries_with_author /. total *. 100.0 in
12141214- let date_pct = float_of_int t.entries_with_date /. total *. 100.0 in
12151215- let content_pct = float_of_int t.entries_with_content /. total *. 100.0 in
12161216- let tags_pct = float_of_int t.entries_with_tags /. total *. 100.0 in
12171217-12181218- (* Weighted average: content and dates are most important *)
12191219- let score =
12201220- (content_pct *. 0.30) +.
12211221- (date_pct *. 0.25) +.
12221222- (author_pct *. 0.20) +.
12231223- (summary_pct *. 0.15) +.
12241224- (tags_pct *. 0.10)
12251225- in
12261226- score
12271227-12281228- let analyze entries =
12291229- if entries = [] then
12301230- failwith "No entries to analyze"
12311231- else
12321232- let total_entries = List.length entries in
12331233-12341234- let entries_with_summary = ref 0 in
12351235- let entries_with_author = ref 0 in
12361236- let entries_with_date = ref total_entries in (* All Atom entries have updated *)
12371237- let entries_with_content = ref 0 in
12381238- let entries_with_tags = ref 0 in
12391239- let content_lengths = ref [] in
12401240- let dates = ref [] in
12411241-12421242- List.iter (fun (entry : Syndic.Atom.entry) ->
12431243- if has_summary entry then incr entries_with_summary;
12441244- if has_author entry then incr entries_with_author;
12451245- if has_content entry then begin
12461246- incr entries_with_content;
12471247- content_lengths := get_content_length entry :: !content_lengths
12481248- end;
12491249- if has_tags entry then incr entries_with_tags;
12501250- dates := entry.updated :: !dates
12511251- ) entries;
12521252-12531253- (* Calculate content statistics *)
12541254- let avg_content_length, min_content_length, max_content_length =
12551255- if !content_lengths = [] then
12561256- (0.0, 0, 0)
12571257- else
12581258- let sorted = List.sort compare !content_lengths in
12591259- let sum = List.fold_left (+) 0 sorted in
12601260- let avg = float_of_int sum /. float_of_int (List.length sorted) in
12611261- let min_len = List.hd sorted in
12621262- let max_len = List.hd (List.rev sorted) in
12631263- (avg, min_len, max_len)
12641264- in
12651265-12661266- (* Calculate posting frequency *)
12671267- let posting_frequency_days =
12681268- if List.length !dates < 2 then
12691269- None
12701270- else
12711271- try
12721272- let timestamps = List.map Ptime.to_float_s !dates in
12731273- let sorted_timestamps = List.sort compare timestamps in
12741274- let first = List.hd sorted_timestamps in
12751275- let last = List.hd (List.rev sorted_timestamps) in
12761276- let total_days = (last -. first) /. 86400.0 in
12771277- let num_intervals = float_of_int (List.length sorted_timestamps - 1) in
12781278- Some (total_days /. num_intervals)
12791279- with _ -> None
12801280- in
12811281-12821282- (* Create metrics record (without quality_score first) *)
12831283- let metrics = {
12841284- total_entries;
12851285- entries_with_summary = !entries_with_summary;
12861286- entries_with_author = !entries_with_author;
12871287- entries_with_date = !entries_with_date;
12881288- entries_with_content = !entries_with_content;
12891289- entries_with_tags = !entries_with_tags;
12901290- avg_content_length;
12911291- min_content_length;
12921292- max_content_length;
12931293- posting_frequency_days;
12941294- quality_score = 0.0; (* Placeholder *)
12951295- } in
12961296-12971297- (* Calculate quality score *)
12981298- let quality_score = calculate_quality_score metrics in
12991299- { metrics with quality_score }
13001300-end
13011301-13021302-(** {1 State Management} *)
13031303-13041304-module State = struct
13051305- type t = {
13061306- xdg : Xdge.t;
13071307- }
13081308-13091309- module Paths = struct
13101310- (** Get the users directory path *)
13111311- let users_dir state = Eio.Path.(Xdge.state_dir state.xdg / "users")
13121312-13131313- (** Get the feeds directory path *)
13141314- let feeds_dir state = Eio.Path.(Xdge.state_dir state.xdg / "feeds")
13151315-13161316- (** Get the user feeds directory path *)
13171317- let user_feeds_dir state = Eio.Path.(feeds_dir state / "user")
13181318-13191319- (** Get the path to a user's JSON file *)
13201320- let user_file state username =
13211321- Eio.Path.(users_dir state / (username ^ ".json"))
13221322-13231323- (** Get the path to a user's Atom feed file *)
13241324- let user_feed_file state username =
13251325- Eio.Path.(user_feeds_dir state / (username ^ ".xml"))
13261326-13271327- (** Ensure all necessary directories exist *)
13281328- let ensure_directories state =
13291329- let dirs = [
13301330- users_dir state;
13311331- feeds_dir state;
13321332- user_feeds_dir state;
13331333- ] in
13341334- List.iter (fun dir ->
13351335- try Eio.Path.mkdir ~perm:0o755 dir
13361336- with Eio.Io (Eio.Fs.E (Already_exists _), _) -> ()
13371337- ) dirs
13381338- end
13391339-13401340- module Json = struct
13411341- (** Decode a user from JSON string *)
13421342- let user_of_string s =
13431343- match Jsont_bytesrw.decode_string' User.jsont s with
13441344- | Ok user -> Some user
13451345- | Error err ->
13461346- Log.err (fun m -> m "Failed to parse user JSON: %s" (Jsont.Error.to_string err));
13471347- None
13481348-13491349- (** Encode a user to JSON string *)
13501350- let user_to_string user =
13511351- match Jsont_bytesrw.encode_string' ~format:Jsont.Indent User.jsont user with
13521352- | Ok s -> s
13531353- | Error err -> failwith ("Failed to encode user: " ^ Jsont.Error.to_string err)
13541354- end
13551355-13561356- module Storage = struct
13571357- (** Load a user from disk *)
13581358- let load_user state username =
13591359- let file = Paths.user_file state username in
13601360- try
13611361- let content = Eio.Path.load file in
13621362- Json.user_of_string content
13631363- with
13641364- | Eio.Io (Eio.Fs.E (Not_found _), _) -> None
13651365- | e ->
13661366- Log.err (fun m -> m "Error loading user %s: %s" username (Printexc.to_string e));
13671367- None
13681368-13691369- (** Save a user to disk *)
13701370- let save_user state user =
13711371- let file = Paths.user_file state (User.username user) in
13721372- let json = Json.user_to_string user in
13731373- Eio.Path.save ~create:(`Or_truncate 0o644) file json
13741374-13751375- (** List all usernames *)
13761376- let list_users state =
13771377- try
13781378- Eio.Path.read_dir (Paths.users_dir state)
13791379- |> List.filter_map (fun name ->
13801380- if Filename.check_suffix name ".json" then
13811381- Some (Filename.chop_suffix name ".json")
13821382- else None
13831383- )
13841384- with _ -> []
13851385-13861386- (** Load existing Atom entries for a user *)
13871387- let load_existing_posts state username =
13881388- let file = Paths.user_feed_file state username in
13891389- try
13901390- let content = Eio.Path.load file in
13911391- (* Parse existing Atom feed *)
13921392- let input = Xmlm.make_input (`String (0, content)) in
13931393- let feed = Syndic.Atom.parse input in
13941394- feed.Syndic.Atom.entries
13951395- with
13961396- | Eio.Io (Eio.Fs.E (Not_found _), _) -> []
13971397- | e ->
13981398- Log.err (fun m -> m "Error loading existing posts for %s: %s"
13991399- username (Printexc.to_string e));
14001400- []
14011401-14021402- (** Save Atom entries for a user *)
14031403- let save_atom_feed state username entries =
14041404- let file = Paths.user_feed_file state username in
14051405- let feed = Format.Atom.feed_of_entries ~title:username entries in
14061406- let xml = Format.Atom.to_string feed in
14071407- Eio.Path.save ~create:(`Or_truncate 0o644) file xml
14081408-14091409- (** Delete a user and their feed file *)
14101410- let delete_user state username =
14111411- let user_file = Paths.user_file state username in
14121412- let feed_file = Paths.user_feed_file state username in
14131413- (try Eio.Path.unlink user_file with _ -> ());
14141414- (try Eio.Path.unlink feed_file with _ -> ())
14151415- end
14161416-14171417- module Sync = struct
14181418- (** Merge new entries with existing ones, updating matching IDs *)
14191419- let merge_entries ~existing ~new_entries =
14201420- (* Create a map of new entry IDs for efficient lookup and updates *)
14211421- let module UriMap = Map.Make(Uri) in
14221422- let new_entries_map =
14231423- List.fold_left (fun acc (entry : Syndic.Atom.entry) ->
14241424- UriMap.add entry.id entry acc
14251425- ) UriMap.empty new_entries
14261426- in
14271427-14281428- (* Update existing entries with new ones if IDs match, otherwise keep existing *)
14291429- let updated_existing =
14301430- List.filter_map (fun (entry : Syndic.Atom.entry) ->
14311431- if UriMap.mem entry.id new_entries_map then
14321432- None (* Will be replaced by new entry *)
14331433- else
14341434- Some entry (* Keep existing entry *)
14351435- ) existing
14361436- in
2020+(** {1 Public API Modules} *)
14372114381438- (* Combine new entries with non-replaced existing entries *)
14391439- let combined = new_entries @ updated_existing in
14401440- List.sort (fun (a : Syndic.Atom.entry) (b : Syndic.Atom.entry) ->
14411441- Ptime.compare b.updated a.updated
14421442- ) combined
14431443-14441444- (** Get current timestamp in ISO 8601 format *)
14451445- let current_timestamp () =
14461446- let open Unix in
14471447- let tm = gmtime (time ()) in
14481448- Printf.sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ"
14491449- (tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday
14501450- tm.tm_hour tm.tm_min tm.tm_sec
14511451-14521452- (** Sync feeds for a single user *)
14531453- let sync_user session state ~username =
14541454- match Storage.load_user state username with
14551455- | None ->
14561456- Error (Printf.sprintf "User %s not found" username)
14571457- | Some user when User.feeds user = [] ->
14581458- Log.info (fun m -> m "No feeds configured for user %s" username);
14591459- Ok ()
14601460- | Some user ->
14611461- Log.info (fun m -> m "Syncing feeds for user %s..." username);
14621462-14631463- (* Fetch all feeds concurrently *)
14641464- let fetched_feeds =
14651465- Eio.Fiber.List.filter_map (fun source ->
14661466- try
14671467- Log.info (fun m -> m " Fetching %s (%s)..."
14681468- (Source.name source) (Source.url source));
14691469- Some (Feed.fetch session source)
14701470- with e ->
14711471- Log.err (fun m -> m " Failed to fetch %s: %s"
14721472- (Source.name source) (Printexc.to_string e));
14731473- None
14741474- ) (User.feeds user)
14751475- in
14761476-14771477- if fetched_feeds = [] then begin
14781478- Error "No feeds successfully fetched"
14791479- end else begin
14801480- (* Get posts from fetched feeds *)
14811481- let posts = Post.of_feeds fetched_feeds in
14821482- Log.info (fun m -> m " Found %d new posts" (List.length posts));
14831483-14841484- (* Convert to Atom entries *)
14851485- let new_entries = Format.Atom.entries_of_posts posts in
14861486-14871487- (* Load existing entries *)
14881488- let existing = Storage.load_existing_posts state username in
14891489- Log.info (fun m -> m " Found %d existing posts" (List.length existing));
14901490-14911491- (* Merge entries *)
14921492- let merged = merge_entries ~existing ~new_entries in
14931493- Log.info (fun m -> m " Total posts after merge: %d" (List.length merged));
14941494-14951495- (* Save updated feed *)
14961496- Storage.save_atom_feed state username merged;
14971497-14981498- (* Update last_synced timestamp *)
14991499- let now = current_timestamp () in
15001500- let user = User.set_last_synced user now in
15011501- Storage.save_user state user;
15021502-15031503- Log.info (fun m -> m "Sync completed for user %s" username);
15041504- Ok ()
15051505- end
15061506- end
15071507-15081508- module Export = struct
15091509- (** Convert Atom entry to JSONFeed item *)
15101510- let atom_entry_to_jsonfeed_item (entry : Syndic.Atom.entry) =
15111511- (* Extract ID *)
15121512- let id = Uri.to_string entry.id in
15131513-15141514- (* Extract title *)
15151515- let title =
15161516- match entry.title with
15171517- | Syndic.Atom.Text s -> Some s
15181518- | Syndic.Atom.Html (_, s) -> Some s
15191519- | Syndic.Atom.Xhtml (_, _) -> Some "Untitled"
15201520- in
15211521-15221522- (* Extract URL *)
15231523- let url =
15241524- match entry.links with
15251525- | link :: _ -> Some (Uri.to_string link.href)
15261526- | [] -> None
15271527- in
15281528-15291529- (* Extract content *)
15301530- let content =
15311531- match entry.content with
15321532- | Some (Syndic.Atom.Text s) -> `Text s
15331533- | Some (Syndic.Atom.Html (_, s)) -> `Html s
15341534- | Some (Syndic.Atom.Xhtml (_, nodes)) ->
15351535- let html = String.concat "" (List.map Syndic.XML.to_string nodes) in
15361536- `Html html
15371537- | Some (Syndic.Atom.Mime _) | Some (Syndic.Atom.Src _) | None ->
15381538- `Text ""
15391539- in
15401540-15411541- (* Extract summary *)
15421542- let summary =
15431543- match entry.summary with
15441544- | Some (Syndic.Atom.Text s) when String.trim s <> "" -> Some s
15451545- | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> Some s
15461546- | _ -> None
15471547- in
15481548-15491549- (* Extract authors *)
15501550- let authors =
15511551- let (author, contributors) = entry.authors in
15521552- let author_list = author :: contributors in
15531553- let jsonfeed_authors = List.filter_map (fun (a : Syndic.Atom.author) ->
15541554- let name = String.trim a.name in
15551555- if name = "" then None
15561556- else Some (Jsonfeed.Author.create ~name ())
15571557- ) author_list in
15581558- if jsonfeed_authors = [] then None else Some jsonfeed_authors
15591559- in
15601560-15611561- (* Extract tags *)
15621562- let tags =
15631563- match entry.categories with
15641564- | [] -> None
15651565- | cats ->
15661566- let tag_list = List.map (fun (c : Syndic.Atom.category) ->
15671567- match c.label with
15681568- | Some lbl -> lbl
15691569- | None -> c.term
15701570- ) cats in
15711571- if tag_list = [] then None else Some tag_list
15721572- in
15731573-15741574- (* Create JSONFeed item *)
15751575- Jsonfeed.Item.create
15761576- ~id
15771577- ~content
15781578- ?title
15791579- ?url
15801580- ?summary
15811581- ?authors
15821582- ?tags
15831583- ~date_published:entry.updated
15841584- ()
15851585-15861586- (** Export entries as JSONFeed *)
15871587- let export_jsonfeed ~title entries =
15881588- let items = List.map atom_entry_to_jsonfeed_item entries in
15891589- let feed = Jsonfeed.create ~title ~items () in
15901590- match Jsonfeed.to_string ~minify:false feed with
15911591- | Ok json -> Ok json
15921592- | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err))
15931593- end
15941594-15951595- let create env ~app_name =
15961596- let xdg = Xdge.create env#fs app_name in
15971597- let state = { xdg } in
15981598- Paths.ensure_directories state;
15991599- state
16001600-16011601- let create_user state user =
16021602- match Storage.load_user state (User.username user) with
16031603- | Some _ ->
16041604- Error (Printf.sprintf "User %s already exists" (User.username user))
16051605- | None ->
16061606- Storage.save_user state user;
16071607- Log.info (fun m -> m "User %s created" (User.username user));
16081608- Ok ()
16091609-16101610- let delete_user state ~username =
16111611- match Storage.load_user state username with
16121612- | None ->
16131613- Error (Printf.sprintf "User %s not found" username)
16141614- | Some _ ->
16151615- Storage.delete_user state username;
16161616- Log.info (fun m -> m "User %s deleted" username);
16171617- Ok ()
16181618-16191619- let get_user state ~username =
16201620- Storage.load_user state username
16211621-16221622- let update_user state user =
16231623- match Storage.load_user state (User.username user) with
16241624- | None ->
16251625- Error (Printf.sprintf "User %s not found" (User.username user))
16261626- | Some _ ->
16271627- Storage.save_user state user;
16281628- Log.info (fun m -> m "User %s updated" (User.username user));
16291629- Ok ()
16301630-16311631- let list_users state =
16321632- Storage.list_users state
16331633-16341634- let sync_user env state ~username =
16351635- Session.with_session env @@ fun session ->
16361636- Sync.sync_user session state ~username
16371637-16381638- let sync_all env state =
16391639- let users = Storage.list_users state in
16401640- if users = [] then begin
16411641- Log.info (fun m -> m "No users to sync");
16421642- Ok (0, 0)
16431643- end else begin
16441644- Log.info (fun m -> m "Syncing %d users concurrently..." (List.length users));
16451645-16461646- Session.with_session env @@ fun session ->
16471647- let results =
16481648- Eio.Fiber.List.map (fun username ->
16491649- match Sync.sync_user session state ~username with
16501650- | Ok () -> true
16511651- | Error err ->
16521652- Log.err (fun m -> m "Failed to sync user %s: %s" username err);
16531653- false
16541654- ) users
16551655- in
16561656- let success_count = List.length (List.filter (fun x -> x) results) in
16571657- let fail_count = List.length users - success_count in
16581658-16591659- if fail_count = 0 then
16601660- Log.info (fun m -> m "All users synced successfully");
16611661-16621662- Ok (success_count, fail_count)
16631663- end
16641664-16651665- let get_user_posts state ~username ?limit () =
16661666- let entries = Storage.load_existing_posts state username in
16671667- match limit with
16681668- | None -> entries
16691669- | Some n -> List.filteri (fun i _ -> i < n) entries
16701670-16711671- let get_all_posts state ?limit () =
16721672- let users = Storage.list_users state in
16731673-16741674- (* Collect all entries from all users with username tag *)
16751675- let all_entries =
16761676- List.concat_map (fun username ->
16771677- let entries = Storage.load_existing_posts state username in
16781678- List.map (fun entry -> (username, entry)) entries
16791679- ) users
16801680- in
16811681-16821682- (* Sort by date (newest first) *)
16831683- let sorted = List.sort (fun (_, a : string * Syndic.Atom.entry) (_, b) ->
16841684- Ptime.compare b.updated a.updated
16851685- ) all_entries in
16861686-16871687- match limit with
16881688- | None -> sorted
16891689- | Some n -> List.filteri (fun i _ -> i < n) sorted
16901690-16911691- let export_merged_feed state ~title ~format ?limit () =
16921692- let all_posts = get_all_posts state ?limit () in
16931693- let entries = List.map snd all_posts in
16941694-16951695- match format with
16961696- | `Atom ->
16971697- let xml = Format.Atom.to_string (Format.Atom.feed_of_entries ~title entries) in
16981698- Ok xml
16991699- | `Jsonfeed ->
17001700- if entries = [] then
17011701- (* Empty JSONFeed *)
17021702- let feed = Jsonfeed.create ~title ~items:[] () in
17031703- match Jsonfeed.to_string ~minify:false feed with
17041704- | Ok json -> Ok json
17051705- | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err))
17061706- else
17071707- Export.export_jsonfeed ~title entries
17081708-17091709- let analyze_user_quality state ~username =
17101710- match Storage.load_user state username with
17111711- | None ->
17121712- Error (Printf.sprintf "User %s not found" username)
17131713- | Some _ ->
17141714- let entries = Storage.load_existing_posts state username in
17151715- if entries = [] then
17161716- Error "No entries to analyze"
17171717- else
17181718- Ok (Quality.analyze entries)
17191719-end
2222+module Source = Source
2323+module Session = Session
2424+module Feed = Feed
2525+module Post = Post
2626+module Format = Format
2727+module User = User
2828+module Quality = Quality
2929+module State = State
+47
stack/river/lib/session.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** HTTP session management for fetching feeds. *)
1919+2020+let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
2121+module Log = (val Logs.src_log src : Logs.LOG)
2222+2323+type t = {
2424+ session : (float Eio.Time.clock_ty Eio.Resource.t,
2525+ [`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t;
2626+}
2727+2828+let init ~sw env =
2929+ Log.info (fun m -> m "Initializing River session");
3030+ let session = Requests.create ~sw
3131+ ~default_headers:(Requests.Headers.of_list [
3232+ ("User-Agent", "OCaml-River/1.0");
3333+ ])
3434+ ~follow_redirects:true
3535+ ~max_redirects:5
3636+ ~verify_tls:true
3737+ env
3838+ in
3939+ { session }
4040+4141+let with_session env f =
4242+ Log.info (fun m -> m "Creating River session");
4343+ Eio.Switch.run @@ fun sw ->
4444+ let client = init ~sw env in
4545+ f client
4646+4747+let get_requests_session t = t.session
+59
stack/river/lib/session.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** HTTP session management for fetching feeds. *)
1919+2020+type t
2121+(** An abstract HTTP session for fetching feeds.
2222+2323+ The session manages HTTP connections and is tied to an Eio switch
2424+ for proper resource cleanup. *)
2525+2626+val init :
2727+ sw:Eio.Switch.t ->
2828+ < clock : float Eio.Time.clock_ty Eio.Resource.t;
2929+ fs : Eio.Fs.dir_ty Eio.Path.t;
3030+ net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
3131+ t
3232+(** [init ~sw env] creates a new HTTP session.
3333+3434+ The session is configured with appropriate defaults for fetching feeds:
3535+ - User-Agent: "OCaml-River/1.0"
3636+ - Automatic redirect following (max 5 redirects)
3737+ - TLS verification enabled
3838+3939+ @param sw The switch for resource management
4040+ @param env The Eio environment *)
4141+4242+val with_session :
4343+ < clock : float Eio.Time.clock_ty Eio.Resource.t;
4444+ fs : Eio.Fs.dir_ty Eio.Path.t;
4545+ net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
4646+ (t -> 'a) -> 'a
4747+(** [with_session env f] creates a session and automatically manages its lifecycle.
4848+4949+ This is the recommended way to use River as it ensures proper cleanup.
5050+5151+ @param env The Eio environment
5252+ @param f The function to run with the session *)
5353+5454+val get_requests_session : t ->
5555+ (float Eio.Time.clock_ty Eio.Resource.t,
5656+ [`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t
5757+(** [get_requests_session t] returns the underlying Requests session.
5858+5959+ This is used internally by the Feed module. *)
+35
stack/river/lib/source.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed source with name and URL. *)
1919+2020+type t = {
2121+ name : string;
2222+ url : string;
2323+}
2424+2525+let make ~name ~url = { name; url }
2626+2727+let name t = t.name
2828+let url t = t.url
2929+3030+let jsont =
3131+ let make name url = { name; url } in
3232+ Jsont.Object.map ~kind:"Source" make
3333+ |> Jsont.Object.mem "name" Jsont.string ~enc:(fun s -> s.name)
3434+ |> Jsont.Object.mem "url" Jsont.string ~enc:(fun s -> s.url)
3535+ |> Jsont.Object.finish
+33
stack/river/lib/source.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Feed source with name and URL. *)
1919+2020+type t
2121+(** A feed source with name and URL. *)
2222+2323+val make : name:string -> url:string -> t
2424+(** [make ~name ~url] creates a new feed source. *)
2525+2626+val name : t -> string
2727+(** [name source] returns the feed name/label. *)
2828+2929+val url : t -> string
3030+(** [url source] returns the feed URL. *)
3131+3232+val jsont : t Jsont.t
3333+(** JSON codec for sources. *)
+436
stack/river/lib/state.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** State management for user data and feeds. *)
1919+2020+let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator"
2121+module Log = (val Logs.src_log src : Logs.LOG)
2222+2323+type t = {
2424+ xdg : Xdge.t;
2525+}
2626+2727+module Paths = struct
2828+ (** Get the users directory path *)
2929+ let users_dir state = Eio.Path.(Xdge.state_dir state.xdg / "users")
3030+3131+ (** Get the feeds directory path *)
3232+ let feeds_dir state = Eio.Path.(Xdge.state_dir state.xdg / "feeds")
3333+3434+ (** Get the user feeds directory path *)
3535+ let user_feeds_dir state = Eio.Path.(feeds_dir state / "user")
3636+3737+ (** Get the path to a user's JSON file *)
3838+ let user_file state username =
3939+ Eio.Path.(users_dir state / (username ^ ".json"))
4040+4141+ (** Get the path to a user's Atom feed file *)
4242+ let user_feed_file state username =
4343+ Eio.Path.(user_feeds_dir state / (username ^ ".xml"))
4444+4545+ (** Ensure all necessary directories exist *)
4646+ let ensure_directories state =
4747+ let dirs = [
4848+ users_dir state;
4949+ feeds_dir state;
5050+ user_feeds_dir state;
5151+ ] in
5252+ List.iter (fun dir ->
5353+ try Eio.Path.mkdir ~perm:0o755 dir
5454+ with Eio.Io (Eio.Fs.E (Already_exists _), _) -> ()
5555+ ) dirs
5656+end
5757+5858+module Json = struct
5959+ (** Decode a user from JSON string *)
6060+ let user_of_string s =
6161+ match Jsont_bytesrw.decode_string' User.jsont s with
6262+ | Ok user -> Some user
6363+ | Error err ->
6464+ Log.err (fun m -> m "Failed to parse user JSON: %s" (Jsont.Error.to_string err));
6565+ None
6666+6767+ (** Encode a user to JSON string *)
6868+ let user_to_string user =
6969+ match Jsont_bytesrw.encode_string' ~format:Jsont.Indent User.jsont user with
7070+ | Ok s -> s
7171+ | Error err -> failwith ("Failed to encode user: " ^ Jsont.Error.to_string err)
7272+end
7373+7474+module Storage = struct
7575+ (** Load a user from disk *)
7676+ let load_user state username =
7777+ let file = Paths.user_file state username in
7878+ try
7979+ let content = Eio.Path.load file in
8080+ Json.user_of_string content
8181+ with
8282+ | Eio.Io (Eio.Fs.E (Not_found _), _) -> None
8383+ | e ->
8484+ Log.err (fun m -> m "Error loading user %s: %s" username (Printexc.to_string e));
8585+ None
8686+8787+ (** Save a user to disk *)
8888+ let save_user state user =
8989+ let file = Paths.user_file state (User.username user) in
9090+ let json = Json.user_to_string user in
9191+ Eio.Path.save ~create:(`Or_truncate 0o644) file json
9292+9393+ (** List all usernames *)
9494+ let list_users state =
9595+ try
9696+ Eio.Path.read_dir (Paths.users_dir state)
9797+ |> List.filter_map (fun name ->
9898+ if Filename.check_suffix name ".json" then
9999+ Some (Filename.chop_suffix name ".json")
100100+ else None
101101+ )
102102+ with _ -> []
103103+104104+ (** Load existing Atom entries for a user *)
105105+ let load_existing_posts state username =
106106+ let file = Paths.user_feed_file state username in
107107+ try
108108+ let content = Eio.Path.load file in
109109+ (* Parse existing Atom feed *)
110110+ let input = Xmlm.make_input (`String (0, content)) in
111111+ let feed = Syndic.Atom.parse input in
112112+ feed.Syndic.Atom.entries
113113+ with
114114+ | Eio.Io (Eio.Fs.E (Not_found _), _) -> []
115115+ | e ->
116116+ Log.err (fun m -> m "Error loading existing posts for %s: %s"
117117+ username (Printexc.to_string e));
118118+ []
119119+120120+ (** Save Atom entries for a user *)
121121+ let save_atom_feed state username entries =
122122+ let file = Paths.user_feed_file state username in
123123+ let feed = Format.Atom.feed_of_entries ~title:username entries in
124124+ let xml = Format.Atom.to_string feed in
125125+ Eio.Path.save ~create:(`Or_truncate 0o644) file xml
126126+127127+ (** Delete a user and their feed file *)
128128+ let delete_user state username =
129129+ let user_file = Paths.user_file state username in
130130+ let feed_file = Paths.user_feed_file state username in
131131+ (try Eio.Path.unlink user_file with _ -> ());
132132+ (try Eio.Path.unlink feed_file with _ -> ())
133133+end
134134+135135+module Sync = struct
136136+ (** Merge new entries with existing ones, updating matching IDs *)
137137+ let merge_entries ~existing ~new_entries =
138138+ (* Create a map of new entry IDs for efficient lookup and updates *)
139139+ let module UriMap = Map.Make(Uri) in
140140+ let new_entries_map =
141141+ List.fold_left (fun acc (entry : Syndic.Atom.entry) ->
142142+ UriMap.add entry.id entry acc
143143+ ) UriMap.empty new_entries
144144+ in
145145+146146+ (* Update existing entries with new ones if IDs match, otherwise keep existing *)
147147+ let updated_existing =
148148+ List.filter_map (fun (entry : Syndic.Atom.entry) ->
149149+ if UriMap.mem entry.id new_entries_map then
150150+ None (* Will be replaced by new entry *)
151151+ else
152152+ Some entry (* Keep existing entry *)
153153+ ) existing
154154+ in
155155+156156+ (* Combine new entries with non-replaced existing entries *)
157157+ let combined = new_entries @ updated_existing in
158158+ List.sort (fun (a : Syndic.Atom.entry) (b : Syndic.Atom.entry) ->
159159+ Ptime.compare b.updated a.updated
160160+ ) combined
161161+162162+ (** Get current timestamp in ISO 8601 format *)
163163+ let current_timestamp () =
164164+ let open Unix in
165165+ let tm = gmtime (time ()) in
166166+ Printf.sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ"
167167+ (tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday
168168+ tm.tm_hour tm.tm_min tm.tm_sec
169169+170170+ (** Sync feeds for a single user *)
171171+ let sync_user session state ~username =
172172+ match Storage.load_user state username with
173173+ | None ->
174174+ Error (Printf.sprintf "User %s not found" username)
175175+ | Some user when User.feeds user = [] ->
176176+ Log.info (fun m -> m "No feeds configured for user %s" username);
177177+ Ok ()
178178+ | Some user ->
179179+ Log.info (fun m -> m "Syncing feeds for user %s..." username);
180180+181181+ (* Fetch all feeds concurrently *)
182182+ let fetched_feeds =
183183+ Eio.Fiber.List.filter_map (fun source ->
184184+ try
185185+ Log.info (fun m -> m " Fetching %s (%s)..."
186186+ (Source.name source) (Source.url source));
187187+ Some (Feed.fetch session source)
188188+ with e ->
189189+ Log.err (fun m -> m " Failed to fetch %s: %s"
190190+ (Source.name source) (Printexc.to_string e));
191191+ None
192192+ ) (User.feeds user)
193193+ in
194194+195195+ if fetched_feeds = [] then begin
196196+ Error "No feeds successfully fetched"
197197+ end else begin
198198+ (* Get posts from fetched feeds *)
199199+ let posts = Post.of_feeds fetched_feeds in
200200+ Log.info (fun m -> m " Found %d new posts" (List.length posts));
201201+202202+ (* Convert to Atom entries *)
203203+ let new_entries = Format.Atom.entries_of_posts posts in
204204+205205+ (* Load existing entries *)
206206+ let existing = Storage.load_existing_posts state username in
207207+ Log.info (fun m -> m " Found %d existing posts" (List.length existing));
208208+209209+ (* Merge entries *)
210210+ let merged = merge_entries ~existing ~new_entries in
211211+ Log.info (fun m -> m " Total posts after merge: %d" (List.length merged));
212212+213213+ (* Save updated feed *)
214214+ Storage.save_atom_feed state username merged;
215215+216216+ (* Update last_synced timestamp *)
217217+ let now = current_timestamp () in
218218+ let user = User.set_last_synced user now in
219219+ Storage.save_user state user;
220220+221221+ Log.info (fun m -> m "Sync completed for user %s" username);
222222+ Ok ()
223223+ end
224224+end
225225+226226+module Export = struct
227227+ (** Convert Atom entry to JSONFeed item *)
228228+ let atom_entry_to_jsonfeed_item (entry : Syndic.Atom.entry) =
229229+ (* Extract ID *)
230230+ let id = Uri.to_string entry.id in
231231+232232+ (* Extract title *)
233233+ let title =
234234+ match entry.title with
235235+ | Syndic.Atom.Text s -> Some s
236236+ | Syndic.Atom.Html (_, s) -> Some s
237237+ | Syndic.Atom.Xhtml (_, _) -> Some "Untitled"
238238+ in
239239+240240+ (* Extract URL *)
241241+ let url =
242242+ match entry.links with
243243+ | link :: _ -> Some (Uri.to_string link.href)
244244+ | [] -> None
245245+ in
246246+247247+ (* Extract content *)
248248+ let content =
249249+ match entry.content with
250250+ | Some (Syndic.Atom.Text s) -> `Text s
251251+ | Some (Syndic.Atom.Html (_, s)) -> `Html s
252252+ | Some (Syndic.Atom.Xhtml (_, nodes)) ->
253253+ let html = String.concat "" (List.map Syndic.XML.to_string nodes) in
254254+ `Html html
255255+ | Some (Syndic.Atom.Mime _) | Some (Syndic.Atom.Src _) | None ->
256256+ `Text ""
257257+ in
258258+259259+ (* Extract summary *)
260260+ let summary =
261261+ match entry.summary with
262262+ | Some (Syndic.Atom.Text s) when String.trim s <> "" -> Some s
263263+ | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> Some s
264264+ | _ -> None
265265+ in
266266+267267+ (* Extract authors *)
268268+ let authors =
269269+ let (author, contributors) = entry.authors in
270270+ let author_list = author :: contributors in
271271+ let jsonfeed_authors = List.filter_map (fun (a : Syndic.Atom.author) ->
272272+ let name = String.trim a.name in
273273+ if name = "" then None
274274+ else Some (Jsonfeed.Author.create ~name ())
275275+ ) author_list in
276276+ if jsonfeed_authors = [] then None else Some jsonfeed_authors
277277+ in
278278+279279+ (* Extract tags *)
280280+ let tags =
281281+ match entry.categories with
282282+ | [] -> None
283283+ | cats ->
284284+ let tag_list = List.map (fun (c : Syndic.Atom.category) ->
285285+ match c.label with
286286+ | Some lbl -> lbl
287287+ | None -> c.term
288288+ ) cats in
289289+ if tag_list = [] then None else Some tag_list
290290+ in
291291+292292+ (* Create JSONFeed item *)
293293+ Jsonfeed.Item.create
294294+ ~id
295295+ ~content
296296+ ?title
297297+ ?url
298298+ ?summary
299299+ ?authors
300300+ ?tags
301301+ ~date_published:entry.updated
302302+ ()
303303+304304+ (** Export entries as JSONFeed *)
305305+ let export_jsonfeed ~title entries =
306306+ let items = List.map atom_entry_to_jsonfeed_item entries in
307307+ let feed = Jsonfeed.create ~title ~items () in
308308+ match Jsonfeed.to_string ~minify:false feed with
309309+ | Ok json -> Ok json
310310+ | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err))
311311+end
312312+313313+let create env ~app_name =
314314+ let xdg = Xdge.create env#fs app_name in
315315+ let state = { xdg } in
316316+ Paths.ensure_directories state;
317317+ state
318318+319319+let create_user state user =
320320+ match Storage.load_user state (User.username user) with
321321+ | Some _ ->
322322+ Error (Printf.sprintf "User %s already exists" (User.username user))
323323+ | None ->
324324+ Storage.save_user state user;
325325+ Log.info (fun m -> m "User %s created" (User.username user));
326326+ Ok ()
327327+328328+let delete_user state ~username =
329329+ match Storage.load_user state username with
330330+ | None ->
331331+ Error (Printf.sprintf "User %s not found" username)
332332+ | Some _ ->
333333+ Storage.delete_user state username;
334334+ Log.info (fun m -> m "User %s deleted" username);
335335+ Ok ()
336336+337337+let get_user state ~username =
338338+ Storage.load_user state username
339339+340340+let update_user state user =
341341+ match Storage.load_user state (User.username user) with
342342+ | None ->
343343+ Error (Printf.sprintf "User %s not found" (User.username user))
344344+ | Some _ ->
345345+ Storage.save_user state user;
346346+ Log.info (fun m -> m "User %s updated" (User.username user));
347347+ Ok ()
348348+349349+let list_users state =
350350+ Storage.list_users state
351351+352352+let sync_user env state ~username =
353353+ Session.with_session env @@ fun session ->
354354+ Sync.sync_user session state ~username
355355+356356+let sync_all env state =
357357+ let users = Storage.list_users state in
358358+ if users = [] then begin
359359+ Log.info (fun m -> m "No users to sync");
360360+ Ok (0, 0)
361361+ end else begin
362362+ Log.info (fun m -> m "Syncing %d users concurrently..." (List.length users));
363363+364364+ Session.with_session env @@ fun session ->
365365+ let results =
366366+ Eio.Fiber.List.map (fun username ->
367367+ match Sync.sync_user session state ~username with
368368+ | Ok () -> true
369369+ | Error err ->
370370+ Log.err (fun m -> m "Failed to sync user %s: %s" username err);
371371+ false
372372+ ) users
373373+ in
374374+ let success_count = List.length (List.filter (fun x -> x) results) in
375375+ let fail_count = List.length users - success_count in
376376+377377+ if fail_count = 0 then
378378+ Log.info (fun m -> m "All users synced successfully");
379379+380380+ Ok (success_count, fail_count)
381381+ end
382382+383383+let get_user_posts state ~username ?limit () =
384384+ let entries = Storage.load_existing_posts state username in
385385+ match limit with
386386+ | None -> entries
387387+ | Some n -> List.filteri (fun i _ -> i < n) entries
388388+389389+let get_all_posts state ?limit () =
390390+ let users = Storage.list_users state in
391391+392392+ (* Collect all entries from all users with username tag *)
393393+ let all_entries =
394394+ List.concat_map (fun username ->
395395+ let entries = Storage.load_existing_posts state username in
396396+ List.map (fun entry -> (username, entry)) entries
397397+ ) users
398398+ in
399399+400400+ (* Sort by date (newest first) *)
401401+ let sorted = List.sort (fun (_, a : string * Syndic.Atom.entry) (_, b) ->
402402+ Ptime.compare b.updated a.updated
403403+ ) all_entries in
404404+405405+ match limit with
406406+ | None -> sorted
407407+ | Some n -> List.filteri (fun i _ -> i < n) sorted
408408+409409+let export_merged_feed state ~title ~format ?limit () =
410410+ let all_posts = get_all_posts state ?limit () in
411411+ let entries = List.map snd all_posts in
412412+413413+ match format with
414414+ | `Atom ->
415415+ let xml = Format.Atom.to_string (Format.Atom.feed_of_entries ~title entries) in
416416+ Ok xml
417417+ | `Jsonfeed ->
418418+ if entries = [] then
419419+ (* Empty JSONFeed *)
420420+ let feed = Jsonfeed.create ~title ~items:[] () in
421421+ match Jsonfeed.to_string ~minify:false feed with
422422+ | Ok json -> Ok json
423423+ | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err))
424424+ else
425425+ Export.export_jsonfeed ~title entries
426426+427427+let analyze_user_quality state ~username =
428428+ match Storage.load_user state username with
429429+ | None ->
430430+ Error (Printf.sprintf "User %s not found" username)
431431+ | Some _ ->
432432+ let entries = Storage.load_existing_posts state username in
433433+ if entries = [] then
434434+ Error "No entries to analyze"
435435+ else
436436+ Ok (Quality.analyze entries)
+120
stack/river/lib/state.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** State management for user data and feeds. *)
1919+2020+type t
2121+(** State handle for managing user data and feeds on disk. *)
2222+2323+val create :
2424+ < fs : Eio.Fs.dir_ty Eio.Path.t; .. > ->
2525+ app_name:string ->
2626+ t
2727+(** [create env ~app_name] creates a state handle using XDG directories.
2828+2929+ Data is stored in:
3030+ - Users: $XDG_STATE_HOME/[app_name]/users/
3131+ - Feeds: $XDG_STATE_HOME/[app_name]/feeds/user/
3232+3333+ @param env The Eio environment with filesystem access
3434+ @param app_name Application name for XDG paths *)
3535+3636+(** {2 User Operations} *)
3737+3838+val create_user : t -> User.t -> (unit, string) result
3939+(** [create_user state user] creates a new user.
4040+4141+ Returns [Error] if the user already exists. *)
4242+4343+val delete_user : t -> username:string -> (unit, string) result
4444+(** [delete_user state ~username] deletes a user and their feed data. *)
4545+4646+val get_user : t -> username:string -> User.t option
4747+(** [get_user state ~username] retrieves a user by username. *)
4848+4949+val update_user : t -> User.t -> (unit, string) result
5050+(** [update_user state user] saves updated user configuration. *)
5151+5252+val list_users : t -> string list
5353+(** [list_users state] returns all usernames. *)
5454+5555+(** {2 Feed Operations} *)
5656+5757+val sync_user :
5858+ < clock : float Eio.Time.clock_ty Eio.Resource.t;
5959+ fs : Eio.Fs.dir_ty Eio.Path.t;
6060+ net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
6161+ t ->
6262+ username:string ->
6363+ (unit, string) result
6464+(** [sync_user env state ~username] fetches all feeds for the user and stores merged result.
6565+6666+ Posts are fetched concurrently and merged with existing posts.
6767+ The result is stored as an Atom feed. *)
6868+6969+val sync_all :
7070+ < clock : float Eio.Time.clock_ty Eio.Resource.t;
7171+ fs : Eio.Fs.dir_ty Eio.Path.t;
7272+ net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > ->
7373+ t ->
7474+ (int * int, string) result
7575+(** [sync_all env state] syncs all users concurrently.
7676+7777+ Returns [Ok (success_count, fail_count)]. *)
7878+7979+val get_user_posts :
8080+ t ->
8181+ username:string ->
8282+ ?limit:int ->
8383+ unit ->
8484+ Syndic.Atom.entry list
8585+(** [get_user_posts state ~username ()] retrieves stored posts for a user.
8686+8787+ @param limit Optional maximum number of posts to return *)
8888+8989+val get_all_posts :
9090+ t ->
9191+ ?limit:int ->
9292+ unit ->
9393+ (string * Syndic.Atom.entry) list
9494+(** [get_all_posts state ()] retrieves posts from all users, sorted by date.
9595+9696+ Returns list of (username, entry) tuples.
9797+ @param limit Optional maximum number of posts to return *)
9898+9999+(** {2 Export} *)
100100+101101+val export_merged_feed :
102102+ t ->
103103+ title:string ->
104104+ format:[ `Atom | `Jsonfeed ] ->
105105+ ?limit:int ->
106106+ unit ->
107107+ (string, string) result
108108+(** [export_merged_feed state ~title ~format ()] exports a merged feed of all users.
109109+110110+ @param title Feed title
111111+ @param format Output format
112112+ @param limit Optional maximum number of entries *)
113113+114114+(** {2 Analysis} *)
115115+116116+val analyze_user_quality :
117117+ t ->
118118+ username:string ->
119119+ (Quality.t, string) result
120120+(** [analyze_user_quality state ~username] analyzes quality metrics for a user's feed. *)
+34
stack/river/lib/text_extract.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** Internal utility for Syndic text extraction *)
1919+2020+open Syndic
2121+2222+(* Remove all tags *)
2323+let rec syndic_to_buffer b = function
2424+ | XML.Node (_, _, subs) -> List.iter (syndic_to_buffer b) subs
2525+ | XML.Data (_, d) -> Buffer.add_string b d
2626+2727+let syndic_to_string x =
2828+ let b = Buffer.create 1024 in
2929+ List.iter (syndic_to_buffer b) x;
3030+ Buffer.contents b
3131+3232+let string_of_text_construct : Atom.text_construct -> string = function
3333+ | Atom.Text s | Atom.Html (_, s) -> s
3434+ | Atom.Xhtml (_, x) -> syndic_to_string x
+57
stack/river/lib/user.ml
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** User management. *)
1919+2020+type t = {
2121+ username : string;
2222+ fullname : string;
2323+ email : string option;
2424+ feeds : Source.t list;
2525+ last_synced : string option;
2626+}
2727+2828+let make ~username ~fullname ?email ?(feeds = []) ?last_synced () =
2929+ { username; fullname; email; feeds; last_synced }
3030+3131+let username t = t.username
3232+let fullname t = t.fullname
3333+let email t = t.email
3434+let feeds t = t.feeds
3535+let last_synced t = t.last_synced
3636+3737+let add_feed t source =
3838+ { t with feeds = source :: t.feeds }
3939+4040+let remove_feed t ~url =
4141+ let feeds = List.filter (fun s -> Source.url s <> url) t.feeds in
4242+ { t with feeds }
4343+4444+let set_last_synced t timestamp =
4545+ { t with last_synced = Some timestamp }
4646+4747+let jsont =
4848+ let make username fullname email feeds last_synced =
4949+ { username; fullname; email; feeds; last_synced }
5050+ in
5151+ Jsont.Object.map ~kind:"User" make
5252+ |> Jsont.Object.mem "username" Jsont.string ~enc:(fun u -> u.username)
5353+ |> Jsont.Object.mem "fullname" Jsont.string ~enc:(fun u -> u.fullname)
5454+ |> Jsont.Object.opt_mem "email" Jsont.string ~enc:(fun u -> u.email)
5555+ |> Jsont.Object.mem "feeds" (Jsont.list Source.jsont) ~enc:(fun u -> u.feeds)
5656+ |> Jsont.Object.opt_mem "last_synced" Jsont.string ~enc:(fun u -> u.last_synced)
5757+ |> Jsont.Object.finish
+64
stack/river/lib/user.mli
···11+(*
22+ * Copyright (c) 2014, OCaml.org project
33+ * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk>
44+ *
55+ * Permission to use, copy, modify, and distribute this software for any
66+ * purpose with or without fee is hereby granted, provided that the above
77+ * copyright notice and this permission notice appear in all copies.
88+ *
99+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1010+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1111+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1212+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1313+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1414+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1515+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1616+ *)
1717+1818+(** User management. *)
1919+2020+type t
2121+(** User configuration with feed subscriptions. *)
2222+2323+val make :
2424+ username:string ->
2525+ fullname:string ->
2626+ ?email:string ->
2727+ ?feeds:Source.t list ->
2828+ ?last_synced:string ->
2929+ unit ->
3030+ t
3131+(** [make ~username ~fullname ()] creates a new user.
3232+3333+ @param username Unique username identifier
3434+ @param fullname User's display name
3535+ @param email Optional email address
3636+ @param feeds Optional list of feed sources (default: [])
3737+ @param last_synced Optional ISO 8601 timestamp of last sync *)
3838+3939+val username : t -> string
4040+(** [username user] returns the username. *)
4141+4242+val fullname : t -> string
4343+(** [fullname user] returns the full name. *)
4444+4545+val email : t -> string option
4646+(** [email user] returns the email address if set. *)
4747+4848+val feeds : t -> Source.t list
4949+(** [feeds user] returns the list of subscribed feeds. *)
5050+5151+val last_synced : t -> string option
5252+(** [last_synced user] returns the last sync timestamp if set. *)
5353+5454+val add_feed : t -> Source.t -> t
5555+(** [add_feed user source] returns a new user with the feed added. *)
5656+5757+val remove_feed : t -> url:string -> t
5858+(** [remove_feed user ~url] returns a new user with the feed removed by URL. *)
5959+6060+val set_last_synced : t -> string -> t
6161+(** [set_last_synced user timestamp] returns a new user with updated sync time. *)
6262+6363+val jsont : t Jsont.t
6464+(** JSON codec for users. *)