this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

sync

+2603 -1785
+177 -86
stack/river/cmd/river_cmd.ml
··· 16 16 let user = River.User.make ~username ~fullname ?email () in 17 17 match River.State.create_user state user with 18 18 | Ok () -> 19 - Log.info (fun m -> m "User %s created" username); 19 + Fmt.pr "@.%a %a %a@.@." 20 + Fmt.(styled (`Fg `Green) string) "✓" 21 + Fmt.(styled `Bold string) "User created:" 22 + Fmt.(styled (`Fg `Cyan) string) username; 20 23 0 21 24 | Error err -> 22 - Log.err (fun m -> m "%s" err); 25 + Fmt.pr "@.%a %s@.@." 26 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 27 + err; 23 28 1 24 29 25 30 let remove state ~username = 26 31 match River.State.delete_user state ~username with 27 32 | Ok () -> 28 - Log.info (fun m -> m "User %s removed" username); 33 + Fmt.pr "@.%a %a %a@.@." 34 + Fmt.(styled (`Fg `Green) string) "✓" 35 + Fmt.(styled `Bold string) "User removed:" 36 + Fmt.(styled (`Fg `Cyan) string) username; 29 37 0 30 38 | Error err -> 31 - Log.err (fun m -> m "%s" err); 39 + Fmt.pr "@.%a %s@.@." 40 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 41 + err; 32 42 1 33 43 34 44 let list state = 35 45 let users = River.State.list_users state in 36 - if users = [] then 37 - Printf.printf "No users found\n" 38 - else begin 39 - Printf.printf "Users:\n"; 46 + if users = [] then begin 47 + Fmt.pr "@.%a@.@." 48 + Fmt.(styled `Yellow string) 49 + "No users found. Use 'river-cli user add' to create one." 50 + end else begin 51 + Fmt.pr "@.%a@." 52 + Fmt.(styled `Bold (styled (`Fg `Cyan) string)) 53 + (Printf.sprintf "Users (%d total)" (List.length users)); 54 + Fmt.pr "%a@.@." Fmt.(styled `Faint string) (String.make 60 '-'); 40 55 List.iter (fun username -> 41 56 match River.State.get_user state ~username with 42 57 | Some user -> 43 58 let email_str = match River.User.email user with 44 - | Some e -> " <" ^ e ^ ">" 59 + | Some e -> Fmt.str " %a" Fmt.(styled `Faint string) ("<" ^ e ^ ">") 45 60 | None -> "" 46 61 in 47 - Printf.printf " %s (%s%s) - %d feeds\n" 48 - username (River.User.fullname user) email_str 49 - (List.length (River.User.feeds user)) 62 + let feed_count = List.length (River.User.feeds user) in 63 + Fmt.pr "%a %a%s@." 64 + Fmt.(styled `Bold (styled (`Fg `Blue) string)) username 65 + Fmt.(styled `Green string) (River.User.fullname user) 66 + email_str; 67 + Fmt.pr " %a %a %a@.@." 68 + Fmt.(styled `Faint string) "└─" 69 + Fmt.(styled (`Fg `Yellow) string) (string_of_int feed_count) 70 + Fmt.(styled `Faint string) (if feed_count = 1 then "feed" else "feeds") 50 71 | None -> () 51 72 ) users 52 73 end; ··· 55 76 let add_feed state ~username ~name ~url = 56 77 match River.State.get_user state ~username with 57 78 | None -> 58 - Log.err (fun m -> m "User %s not found" username); 79 + Fmt.pr "@.%a User %a not found@.@." 80 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 81 + Fmt.(styled `Bold string) username; 59 82 1 60 83 | Some user -> 61 84 let source = River.Source.make ~name ~url in 62 85 let user = River.User.add_feed user source in 63 86 (match River.State.update_user state user with 64 87 | Ok () -> 65 - Log.info (fun m -> m "Feed %s added to user %s" name username); 88 + Fmt.pr "@.%a Feed added to %a@." 89 + Fmt.(styled (`Fg `Green) string) "✓" 90 + Fmt.(styled (`Fg `Cyan) string) username; 91 + Fmt.pr " %a %a@." 92 + Fmt.(styled `Faint string) "Name:" 93 + Fmt.(styled `Bold string) name; 94 + Fmt.pr " %a %a@.@." 95 + Fmt.(styled `Faint string) "URL: " 96 + Fmt.(styled (`Fg `Blue) string) url; 66 97 0 67 98 | Error err -> 68 - Log.err (fun m -> m "%s" err); 99 + Fmt.pr "@.%a %s@.@." 100 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 101 + err; 69 102 1) 70 103 71 104 let remove_feed state ~username ~url = 72 105 match River.State.get_user state ~username with 73 106 | None -> 74 - Log.err (fun m -> m "User %s not found" username); 107 + Fmt.pr "@.%a User %a not found@.@." 108 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 109 + Fmt.(styled `Bold string) username; 75 110 1 76 111 | Some user -> 77 112 let user = River.User.remove_feed user ~url in 78 113 (match River.State.update_user state user with 79 114 | Ok () -> 80 - Log.info (fun m -> m "Feed removed from user %s" username); 115 + Fmt.pr "@.%a Feed removed from %a@.@." 116 + Fmt.(styled (`Fg `Green) string) "✓" 117 + Fmt.(styled (`Fg `Cyan) string) username; 81 118 0 82 119 | Error err -> 83 - Log.err (fun m -> m "%s" err); 120 + Fmt.pr "@.%a %s@.@." 121 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 122 + err; 84 123 1) 85 124 86 125 let show state ~username = 87 126 match River.State.get_user state ~username with 88 127 | None -> 89 - Log.err (fun m -> m "User %s not found" username); 128 + Fmt.pr "@.%a User %a not found@.@." 129 + Fmt.(styled (`Fg `Red) string) "✗ Error:" 130 + Fmt.(styled `Bold string) username; 90 131 1 91 132 | Some user -> 92 - Printf.printf "Username: %s\n" (River.User.username user); 93 - Printf.printf "Full name: %s\n" (River.User.fullname user); 94 - Printf.printf "Email: %s\n" 95 - (Option.value (River.User.email user) ~default:"(none)"); 96 - Printf.printf "Last synced: %s\n" 133 + Fmt.pr "@.%a@." 134 + Fmt.(styled `Bold (styled (`Fg `Cyan) string)) 135 + (Printf.sprintf "User: %s" (River.User.username user)); 136 + Fmt.pr "%a@.@." Fmt.(styled `Faint string) (String.make 60 '-'); 137 + 138 + Fmt.pr "%a %a@." 139 + Fmt.(styled `Faint string) "Full name:" 140 + Fmt.(styled `Green string) (River.User.fullname user); 141 + 142 + Fmt.pr "%a %a@." 143 + Fmt.(styled `Faint string) "Email: " 144 + Fmt.string (Option.value (River.User.email user) ~default:"(not set)"); 145 + 146 + Fmt.pr "%a %a@.@." 147 + Fmt.(styled `Faint string) "Synced: " 148 + Fmt.(styled `Yellow string) 97 149 (Option.value (River.User.last_synced user) ~default:"never"); 150 + 98 151 let feeds = River.User.feeds user in 99 - Printf.printf "Feeds (%d):\n" (List.length feeds); 100 - List.iter (fun feed -> 101 - Printf.printf " - %s: %s\n" 102 - (River.Source.name feed) (River.Source.url feed) 103 - ) feeds; 152 + Fmt.pr "%a@." 153 + Fmt.(styled `Bold string) 154 + (Printf.sprintf "Feeds (%d)" (List.length feeds)); 155 + Fmt.pr "%a@." Fmt.(styled `Faint string) (String.make 60 '-'); 156 + 157 + if feeds = [] then 158 + Fmt.pr "%a@.@." 159 + Fmt.(styled `Faint string) 160 + " No feeds configured. Use 'river-cli user add-feed' to add one." 161 + else 162 + List.iter (fun feed -> 163 + Fmt.pr "@.%a@." 164 + Fmt.(styled `Bold (styled (`Fg `Blue) string)) 165 + (River.Source.name feed); 166 + Fmt.pr " %a %a@.@." 167 + Fmt.(styled `Faint string) "URL:" 168 + Fmt.(styled (`Fg `Magenta) string) (River.Source.url feed) 169 + ) feeds; 104 170 0 105 171 end 106 172 107 173 (* Sync command *) 108 174 module Sync = struct 109 175 let sync_user env state ~username = 176 + Fmt.pr "@.%a Syncing feeds for %a...@." 177 + Fmt.(styled (`Fg `Cyan) string) "→" 178 + Fmt.(styled `Bold string) username; 110 179 match River.State.sync_user env state ~username with 111 180 | Ok () -> 112 - Log.info (fun m -> m "Sync completed for user %s" username); 181 + Fmt.pr "%a Sync completed successfully@.@." 182 + Fmt.(styled (`Fg `Green) string) "✓"; 113 183 0 114 184 | Error err -> 115 - Log.err (fun m -> m "Sync failed: %s" err); 185 + Fmt.pr "%a Sync failed: %s@.@." 186 + Fmt.(styled (`Fg `Red) string) "✗" 187 + err; 116 188 1 117 189 118 190 let sync_all env state = 191 + Fmt.pr "@.%a Syncing all users...@.@." 192 + Fmt.(styled (`Fg `Cyan) string) "→"; 119 193 match River.State.sync_all env state with 120 194 | Ok (success, fail) -> 121 - Log.info (fun m -> m "Synced %d users (%d failed)" success fail); 122 - if fail = 0 then 0 else 1 195 + if fail = 0 then begin 196 + Fmt.pr "%a Successfully synced %a@.@." 197 + Fmt.(styled (`Fg `Green) string) "✓" 198 + Fmt.(styled `Bold (styled (`Fg `Green) string)) (Printf.sprintf "%d users" success); 199 + 0 200 + end else begin 201 + Fmt.pr "%a Synced %a, %a@.@." 202 + Fmt.(styled `Yellow string) "⚠" 203 + Fmt.(styled (`Fg `Green) string) (Printf.sprintf "%d users" success) 204 + Fmt.(styled (`Fg `Red) string) (Printf.sprintf "%d failed" fail); 205 + 1 206 + end 123 207 | Error err -> 124 - Log.err (fun m -> m "Sync failed: %s" err); 208 + Fmt.pr "%a Sync failed: %s@.@." 209 + Fmt.(styled (`Fg `Red) string) "✗" 210 + err; 125 211 1 126 212 end 127 213 ··· 633 719 1 634 720 | Ok metrics -> 635 721 (* Display quality metrics *) 636 - Fmt.pr "@."; 637 - Fmt.pr "%a@." Fmt.(styled `Bold string) 638 - (String.make 70 '='); 639 - Fmt.pr " %a %s@." Fmt.(styled `Bold (styled (`Fg `Blue) string)) 640 - "User Quality Analysis:" username; 641 - Fmt.pr "%a@.@." Fmt.(styled `Bold string) 642 - (String.make 70 '='); 722 + Fmt.pr "@.%a@." 723 + Fmt.(styled `Bold (styled (`Fg `Cyan) string)) 724 + (Printf.sprintf "Feed Quality Analysis: %s" username); 725 + Fmt.pr "%a@.@." Fmt.(styled `Faint string) (String.make 70 '='); 643 726 644 - (* Overall quality score *) 727 + (* Overall quality score with visual indicator *) 645 728 let score = River.Quality.quality_score metrics in 646 - let score_color = match score with 647 - | s when s >= 80.0 -> `Green 648 - | s when s >= 60.0 -> `Yellow 649 - | s when s >= 40.0 -> `Magenta 650 - | _ -> `Red 729 + let score_color, score_label = match score with 730 + | s when s >= 80.0 -> `Green, "Excellent" 731 + | s when s >= 60.0 -> `Yellow, "Good" 732 + | s when s >= 40.0 -> `Magenta, "Fair" 733 + | _ -> `Red, "Poor" 651 734 in 652 - Fmt.pr "%a %.1f/100.0@.@." 653 - Fmt.(styled (`Fg score_color) (styled `Bold string)) 654 - "Overall Quality Score:" 655 - score; 735 + let bar_width = 40 in 736 + let filled = int_of_float (score /. 100.0 *. float_of_int bar_width) in 737 + let bar = String.make filled '#' ^ String.make (bar_width - filled) '-' in 738 + Fmt.pr "%a@." 739 + Fmt.(styled `Bold string) "Overall Quality Score"; 740 + Fmt.pr " %a %.1f/100 %a@.@." 741 + Fmt.(styled (`Fg score_color) string) bar 742 + score 743 + Fmt.(styled (`Fg score_color) (styled `Bold string)) (Printf.sprintf "(%s)" score_label); 656 744 657 745 (* Entry statistics *) 658 - Fmt.pr "%a@." Fmt.(styled `Cyan string) "Entry Statistics:"; 659 - Fmt.pr " Total entries: %d@." (River.Quality.total_entries metrics); 746 + Fmt.pr "%a %a@." 747 + Fmt.(styled `Bold string) "📊 Entries:" 748 + Fmt.(styled (`Fg `Yellow) (styled `Bold string)) 749 + (string_of_int (River.Quality.total_entries metrics)); 660 750 Fmt.pr "@."; 661 751 662 - (* Completeness metrics *) 663 - Fmt.pr "%a@." Fmt.(styled `Cyan string) "Completeness:"; 752 + (* Completeness metrics with visual indicators *) 753 + Fmt.pr "%a@." Fmt.(styled `Bold string) "Completeness"; 664 754 let total = River.Quality.total_entries metrics in 665 755 let pct entries = 666 756 float_of_int entries /. float_of_int total *. 100.0 667 757 in 668 - Fmt.pr " Entries with content: %3d/%d (%5.1f%%)@." 669 - (River.Quality.entries_with_content metrics) 670 - total 671 - (pct (River.Quality.entries_with_content metrics)); 672 - Fmt.pr " Entries with dates: %3d/%d (%5.1f%%)@." 673 - (River.Quality.entries_with_date metrics) 674 - total 675 - (pct (River.Quality.entries_with_date metrics)); 676 - Fmt.pr " Entries with authors: %3d/%d (%5.1f%%)@." 677 - (River.Quality.entries_with_author metrics) 678 - total 679 - (pct (River.Quality.entries_with_author metrics)); 680 - Fmt.pr " Entries with summaries:%3d/%d (%5.1f%%)@." 681 - (River.Quality.entries_with_summary metrics) 682 - total 683 - (pct (River.Quality.entries_with_summary metrics)); 684 - Fmt.pr " Entries with tags: %3d/%d (%5.1f%%)@." 685 - (River.Quality.entries_with_tags metrics) 686 - total 687 - (pct (River.Quality.entries_with_tags metrics)); 758 + let show_metric label count = 759 + let p = pct count in 760 + let icon, color = match p with 761 + | p when p >= 90.0 -> "✓", `Green 762 + | p when p >= 50.0 -> "○", `Yellow 763 + | _ -> "✗", `Red 764 + in 765 + Fmt.pr " %a %s %3d/%d %a@." 766 + Fmt.(styled (`Fg color) string) icon 767 + label 768 + count total 769 + Fmt.(styled `Faint string) (Printf.sprintf "(%.1f%%)" p) 770 + in 771 + show_metric "Content: " (River.Quality.entries_with_content metrics); 772 + show_metric "Dates: " (River.Quality.entries_with_date metrics); 773 + show_metric "Authors: " (River.Quality.entries_with_author metrics); 774 + show_metric "Summaries:" (River.Quality.entries_with_summary metrics); 775 + show_metric "Tags: " (River.Quality.entries_with_tags metrics); 688 776 Fmt.pr "@."; 689 777 690 778 (* Content statistics *) 691 779 if River.Quality.entries_with_content metrics > 0 then begin 692 - Fmt.pr "%a@." Fmt.(styled `Cyan string) "Content Statistics:"; 693 - Fmt.pr " Average length: %.0f characters@." 780 + Fmt.pr "%a@." Fmt.(styled `Bold string) "Content Statistics"; 781 + Fmt.pr " %a %.0f chars@." 782 + Fmt.(styled `Faint string) "Average:" 694 783 (River.Quality.avg_content_length metrics); 695 - Fmt.pr " Min length: %d characters@." 696 - (River.Quality.min_content_length metrics); 697 - Fmt.pr " Max length: %d characters@." 698 - (River.Quality.max_content_length metrics); 699 - Fmt.pr "@." 784 + Fmt.pr " %a %a ... %a@.@." 785 + Fmt.(styled `Faint string) "Range: " 786 + Fmt.(styled (`Fg `Cyan) string) (string_of_int (River.Quality.min_content_length metrics)) 787 + Fmt.(styled (`Fg `Cyan) string) (string_of_int (River.Quality.max_content_length metrics)) 700 788 end; 701 789 702 790 (* Posting frequency *) 703 791 (match River.Quality.posting_frequency_days metrics with 704 792 | Some freq -> 705 - Fmt.pr "%a@." Fmt.(styled `Cyan string) "Posting Frequency:"; 706 - Fmt.pr " Average: %.1f days between posts@." freq; 793 + Fmt.pr "%a@." Fmt.(styled `Bold string) "Posting Frequency"; 707 794 let posts_per_week = 7.0 /. freq in 708 - Fmt.pr " (~%.1f posts per week)@." posts_per_week; 709 - Fmt.pr "@." 795 + Fmt.pr " %a %.1f days between posts@." 796 + Fmt.(styled `Faint string) "Average:" 797 + freq; 798 + Fmt.pr " %a ~%.1f posts/week@.@." 799 + Fmt.(styled `Faint string) " " 800 + posts_per_week 710 801 | None -> 711 802 Fmt.pr "%a@.@." Fmt.(styled `Faint string) 712 803 "Not enough data to calculate posting frequency");
+123
stack/river/lib/feed.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed fetching and parsing. *) 19 + 20 + let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator" 21 + module Log = (val Logs.src_log src : Logs.LOG) 22 + 23 + type feed_content = 24 + | Atom of Syndic.Atom.feed 25 + | Rss2 of Syndic.Rss2.channel 26 + | Json of Jsonfeed.t 27 + 28 + type t = { 29 + source : Source.t; 30 + title : string; 31 + content : feed_content; 32 + } 33 + 34 + let string_of_feed = function 35 + | Atom _ -> "Atom" 36 + | Rss2 _ -> "Rss2" 37 + | Json _ -> "JSONFeed" 38 + 39 + let classify_feed ~xmlbase (body : string) = 40 + Log.debug (fun m -> m "Attempting to parse feed (%d bytes)" (String.length body)); 41 + 42 + (* Quick check - does it look like JSON? *) 43 + let looks_like_json = 44 + String.length body > 0 && 45 + let first_char = String.get body 0 in 46 + first_char = '{' || first_char = '[' 47 + in 48 + 49 + if looks_like_json then ( 50 + (* Try JSONFeed first *) 51 + Log.debug (fun m -> m "Body looks like JSON, trying JSONFeed parser"); 52 + match Jsonfeed.of_string body with 53 + | Ok jsonfeed -> 54 + Log.debug (fun m -> m "Successfully parsed as JSONFeed"); 55 + Json jsonfeed 56 + | Error err -> 57 + Log.debug (fun m -> m "Not a JSONFeed: %s" (Jsont.Error.to_string err)); 58 + (* Fall through to XML parsing *) 59 + failwith "Not a valid JSONFeed" 60 + ) else ( 61 + (* Try XML formats *) 62 + try 63 + let feed = Atom (Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in 64 + Log.debug (fun m -> m "Successfully parsed as Atom feed"); 65 + feed 66 + with 67 + | Syndic.Atom.Error.Error (pos, msg) -> ( 68 + Log.debug (fun m -> m "Not an Atom feed: %s at position (%d, %d)" 69 + msg (fst pos) (snd pos)); 70 + try 71 + let feed = Rss2 (Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in 72 + Log.debug (fun m -> m "Successfully parsed as RSS2 feed"); 73 + feed 74 + with Syndic.Rss2.Error.Error (pos, msg) -> 75 + Log.err (fun m -> m "Failed to parse as RSS2: %s at position (%d, %d)" 76 + msg (fst pos) (snd pos)); 77 + failwith "Neither Atom nor RSS2 feed") 78 + | Not_found as e -> 79 + Log.err (fun m -> m "Not_found exception during Atom feed parsing"); 80 + Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ())); 81 + raise e 82 + | e -> 83 + Log.err (fun m -> m "Unexpected exception during feed parsing: %s" 84 + (Printexc.to_string e)); 85 + Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ())); 86 + raise e 87 + ) 88 + 89 + let fetch session source = 90 + Log.info (fun m -> m "Fetching feed: %s" (Source.name source)); 91 + 92 + let xmlbase = Uri.of_string (Source.url source) in 93 + 94 + (* Use Requests_json_api.get_result for clean Result-based error handling *) 95 + let requests_session = Session.get_requests_session session in 96 + let response = 97 + match Requests_json_api.get_result requests_session (Source.url source) with 98 + | Ok body -> 99 + Log.info (fun m -> m "Successfully fetched %s (%d bytes)" 100 + (Source.url source) (String.length body)); 101 + body 102 + | Error (status, msg) -> 103 + Log.err (fun m -> m "Failed to fetch feed '%s': HTTP %d - %s" 104 + (Source.name source) status msg); 105 + failwith (Printf.sprintf "HTTP %d: %s" status msg) 106 + in 107 + 108 + let content = classify_feed ~xmlbase response in 109 + let title = 110 + match content with 111 + | Atom atom -> Text_extract.string_of_text_construct atom.Syndic.Atom.title 112 + | Rss2 ch -> ch.Syndic.Rss2.title 113 + | Json jsonfeed -> Jsonfeed.title jsonfeed 114 + in 115 + 116 + Log.info (fun m -> m "Successfully fetched %s feed '%s' (title: '%s')" 117 + (string_of_feed content) (Source.name source) title); 118 + 119 + { source; title; content } 120 + 121 + let source t = t.source 122 + let content t = t.content 123 + let title t = t.title
+43
stack/river/lib/feed.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed fetching and parsing. *) 19 + 20 + type feed_content = 21 + | Atom of Syndic.Atom.feed 22 + | Rss2 of Syndic.Rss2.channel 23 + | Json of Jsonfeed.t 24 + (** The underlying feed content, which can be Atom, RSS2, or JSONFeed format. *) 25 + 26 + type t 27 + (** An Atom, RSS2, or JSON Feed. *) 28 + 29 + val fetch : Session.t -> Source.t -> t 30 + (** [fetch session source] fetches and parses a feed from the given source. 31 + 32 + @param session The HTTP session 33 + @param source The feed source to fetch 34 + @raise Failure if the feed cannot be fetched or parsed *) 35 + 36 + val source : t -> Source.t 37 + (** [source feed] returns the source this feed was fetched from. *) 38 + 39 + val content : t -> feed_content 40 + (** [content feed] returns the underlying feed content. *) 41 + 42 + val title : t -> string 43 + (** [title feed] returns the feed title. *)
+139
stack/river/lib/format.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed format conversion and export. *) 19 + 20 + module Atom = struct 21 + let entry_of_post post = 22 + let content = Syndic.Atom.Html (None, Post.content post) in 23 + let contributors = 24 + [ Syndic.Atom.author ~uri:(Uri.of_string (Source.url (Feed.source (Post.feed post)))) 25 + (Source.name (Feed.source (Post.feed post))) ] 26 + in 27 + let links = 28 + match Post.link post with 29 + | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ] 30 + | None -> [] 31 + in 32 + let id = 33 + match Post.link post with 34 + | Some l -> l 35 + | None -> Uri.of_string (Digest.to_hex (Digest.string (Post.title post))) 36 + in 37 + let authors = (Syndic.Atom.author ~email:(Post.email post) (Post.author post), []) in 38 + let title : Syndic.Atom.text_construct = Syndic.Atom.Text (Post.title post) in 39 + let updated = 40 + match Post.date post with 41 + (* Atom entry requires a date but RSS2 does not. So if a date 42 + * is not available, just capture the current date. *) 43 + | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get 44 + | Some d -> d 45 + in 46 + Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated 47 + () 48 + 49 + let entries_of_posts posts = List.map entry_of_post posts 50 + 51 + let feed_of_entries ~title ?id ?(authors = []) entries = 52 + let feed_id = match id with 53 + | Some i -> Uri.of_string i 54 + | None -> Uri.of_string "urn:river:merged" 55 + in 56 + let feed_authors = List.map (fun (name, email) -> 57 + match email with 58 + | Some e -> Syndic.Atom.author ~email:e name 59 + | None -> Syndic.Atom.author name 60 + ) authors in 61 + { 62 + Syndic.Atom.id = feed_id; 63 + title = Syndic.Atom.Text title; 64 + updated = Ptime.of_float_s (Unix.time ()) |> Option.get; 65 + entries; 66 + authors = feed_authors; 67 + categories = []; 68 + contributors = []; 69 + generator = Some { 70 + Syndic.Atom.version = Some "1.0"; 71 + uri = None; 72 + content = "River Feed Aggregator"; 73 + }; 74 + icon = None; 75 + links = []; 76 + logo = None; 77 + rights = None; 78 + subtitle = None; 79 + } 80 + 81 + let to_string feed = 82 + let output = Buffer.create 4096 in 83 + Syndic.Atom.output feed (`Buffer output); 84 + Buffer.contents output 85 + end 86 + 87 + module Rss2 = struct 88 + let of_feed feed = 89 + match Feed.content feed with 90 + | Feed.Rss2 ch -> Some ch 91 + | _ -> None 92 + end 93 + 94 + module Jsonfeed = struct 95 + let item_of_post post = 96 + (* Convert HTML content back to string *) 97 + let html = Post.content post in 98 + let content = `Html html in 99 + 100 + (* Create author *) 101 + let authors = 102 + if Post.author post <> "" then 103 + let author = Jsonfeed.Author.create ~name:(Post.author post) () in 104 + Some [author] 105 + else 106 + None 107 + in 108 + 109 + (* Create item *) 110 + Jsonfeed.Item.create 111 + ~id:(Post.id post) 112 + ~content 113 + ?url:(Option.map Uri.to_string (Post.link post)) 114 + ~title:(Post.title post) 115 + ?summary:(Post.summary post) 116 + ?date_published:(Post.date post) 117 + ?authors 118 + ~tags:(Post.tags post) 119 + () 120 + 121 + let items_of_posts posts = List.map item_of_post posts 122 + 123 + let feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items = 124 + Jsonfeed.create ~title ?home_page_url ?feed_url ?description ?icon ?favicon ~items () 125 + 126 + let feed_of_posts ~title ?home_page_url ?feed_url ?description ?icon ?favicon posts = 127 + let items = items_of_posts posts in 128 + feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items 129 + 130 + let to_string ?(minify = false) jsonfeed = 131 + match Jsonfeed.to_string ~minify jsonfeed with 132 + | Ok s -> Ok s 133 + | Error err -> Error (Jsont.Error.to_string err) 134 + 135 + let of_feed feed = 136 + match Feed.content feed with 137 + | Feed.Json jf -> Some jf 138 + | _ -> None 139 + end
+103
stack/river/lib/format.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed format conversion and export. *) 19 + 20 + module Atom : sig 21 + (** Atom 1.0 format support. *) 22 + 23 + val entry_of_post : Post.t -> Syndic.Atom.entry 24 + (** [entry_of_post post] converts a post to an Atom entry. *) 25 + 26 + val entries_of_posts : Post.t list -> Syndic.Atom.entry list 27 + (** [entries_of_posts posts] converts posts to Atom entries. *) 28 + 29 + val feed_of_entries : 30 + title:string -> 31 + ?id:string -> 32 + ?authors:(string * string option) list -> 33 + Syndic.Atom.entry list -> 34 + Syndic.Atom.feed 35 + (** [feed_of_entries ~title entries] creates an Atom feed from entries. 36 + 37 + @param title The feed title 38 + @param id Optional feed ID (default: "urn:river:merged") 39 + @param authors Optional list of (name, email) tuples *) 40 + 41 + val to_string : Syndic.Atom.feed -> string 42 + (** [to_string feed] serializes an Atom feed to XML string. *) 43 + end 44 + 45 + module Rss2 : sig 46 + (** RSS 2.0 format support. *) 47 + 48 + val of_feed : Feed.t -> Syndic.Rss2.channel option 49 + (** [of_feed feed] extracts RSS2 channel if the feed is RSS2 format. 50 + 51 + Returns None if the feed is not RSS2. *) 52 + end 53 + 54 + module Jsonfeed : sig 55 + (** JSON Feed 1.1 format support. *) 56 + 57 + val item_of_post : Post.t -> Jsonfeed.Item.t 58 + (** [item_of_post post] converts a post to a JSONFeed item. *) 59 + 60 + val items_of_posts : Post.t list -> Jsonfeed.Item.t list 61 + (** [items_of_posts posts] converts posts to JSONFeed items. *) 62 + 63 + val feed_of_items : 64 + title:string -> 65 + ?home_page_url:string -> 66 + ?feed_url:string -> 67 + ?description:string -> 68 + ?icon:string -> 69 + ?favicon:string -> 70 + Jsonfeed.Item.t list -> 71 + Jsonfeed.t 72 + (** [feed_of_items ~title items] creates a JSONFeed from items. 73 + 74 + @param title The feed title (required) 75 + @param home_page_url The URL of the website the feed represents 76 + @param feed_url The URL of the feed itself 77 + @param description A description of the feed 78 + @param icon URL of an icon for the feed (512x512 recommended) 79 + @param favicon URL of a favicon for the feed (64x64 recommended) *) 80 + 81 + val feed_of_posts : 82 + title:string -> 83 + ?home_page_url:string -> 84 + ?feed_url:string -> 85 + ?description:string -> 86 + ?icon:string -> 87 + ?favicon:string -> 88 + Post.t list -> 89 + Jsonfeed.t 90 + (** [feed_of_posts ~title posts] creates a JSONFeed from posts. 91 + 92 + Convenience function that combines [items_of_posts] and [feed_of_items]. *) 93 + 94 + val to_string : ?minify:bool -> Jsonfeed.t -> (string, string) result 95 + (** [to_string ?minify feed] serializes a JSONFeed to JSON string. 96 + 97 + @param minify If true, output compact JSON; if false, pretty-print (default: false) *) 98 + 99 + val of_feed : Feed.t -> Jsonfeed.t option 100 + (** [of_feed feed] extracts JSONFeed if the feed is JSONFeed format. 101 + 102 + Returns None if the feed is not JSONFeed. *) 103 + end
+330
stack/river/lib/html_markdown.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Internal utility for HTML to Markdown conversion *) 19 + 20 + [@@@warning "-32"] (* Suppress unused value warnings for internal utilities *) 21 + 22 + (** HTML to Markdown converter using Lambda Soup *) 23 + 24 + (** Extract all links from HTML content *) 25 + let extract_links html_str = 26 + try 27 + let soup = Soup.parse html_str in 28 + let links = Soup.select "a[href]" soup in 29 + Soup.fold (fun acc link -> 30 + match Soup.attribute "href" link with 31 + | Some href -> 32 + let text = Soup.texts link |> String.concat "" |> String.trim in 33 + (href, text) :: acc 34 + | None -> acc 35 + ) [] links 36 + |> List.rev 37 + with _ -> [] 38 + 39 + (** Check if string contains any whitespace *) 40 + let has_whitespace s = 41 + try 42 + let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in 43 + true 44 + with Not_found -> false 45 + 46 + (** Clean up excessive newlines and normalize spacing *) 47 + let cleanup_markdown s = 48 + (* Normalize line endings *) 49 + let s = Str.global_replace (Str.regexp "\r\n") "\n" s in 50 + 51 + (* Remove trailing whitespace from each line *) 52 + let lines = String.split_on_char '\n' s in 53 + let lines = List.map (fun line -> 54 + (* Trim trailing spaces but preserve leading spaces for indentation *) 55 + let len = String.length line in 56 + let rec find_last_non_space i = 57 + if i < 0 then -1 58 + else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1) 59 + else i 60 + in 61 + let last = find_last_non_space (len - 1) in 62 + if last < 0 then "" 63 + else String.sub line 0 (last + 1) 64 + ) lines in 65 + 66 + (* Join back and collapse excessive blank lines *) 67 + let s = String.concat "\n" lines in 68 + 69 + (* Replace 3+ consecutive newlines with just 2 *) 70 + let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in 71 + 72 + (* Trim leading and trailing whitespace *) 73 + String.trim s 74 + 75 + (** Convert HTML to Markdown using state-based whitespace handling *) 76 + let html_to_markdown html_str = 77 + try 78 + let soup = Soup.parse html_str in 79 + let buffer = Buffer.create 256 in 80 + 81 + (* State: track if we need to insert a space before next text *) 82 + let need_space = ref false in 83 + 84 + (* Get last character in buffer, if any *) 85 + let last_char () = 86 + let len = Buffer.length buffer in 87 + if len = 0 then None 88 + else Some (Buffer.nth buffer (len - 1)) 89 + in 90 + 91 + (* Add text with proper spacing *) 92 + let add_text text = 93 + let trimmed = String.trim text in 94 + if trimmed <> "" then begin 95 + (* Check if text starts with punctuation that shouldn't have space before it *) 96 + let starts_with_punctuation = 97 + String.length trimmed > 0 && 98 + (match trimmed.[0] with 99 + | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true 100 + | _ -> false) 101 + in 102 + 103 + (* Add space if needed, unless we're before punctuation *) 104 + if !need_space && not starts_with_punctuation then begin 105 + match last_char () with 106 + | Some (' ' | '\n') -> () 107 + | _ -> Buffer.add_char buffer ' ' 108 + end; 109 + Buffer.add_string buffer trimmed; 110 + need_space := false 111 + end 112 + in 113 + 114 + (* Mark that we need space before next text (for inline elements) *) 115 + let mark_space_needed () = 116 + need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0 117 + in 118 + 119 + (* Process header with ID/anchor handling *) 120 + let process_header level elem = 121 + need_space := false; 122 + 123 + (* Check if header contains a link with an ID fragment *) 124 + let link_opt = Soup.select_one "a[href]" elem in 125 + let anchor_id = match link_opt with 126 + | Some link -> 127 + (match Soup.attribute "href" link with 128 + | Some href -> 129 + (* Extract fragment from URL *) 130 + let uri = Uri.of_string href in 131 + Uri.fragment uri 132 + | None -> None) 133 + | None -> None 134 + in 135 + 136 + (* Add anchor if we found an ID *) 137 + (match anchor_id with 138 + | Some id when id <> "" -> 139 + Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id) 140 + | _ -> ()); 141 + 142 + (* Add the header marker *) 143 + let marker = String.make level '#' in 144 + Buffer.add_string buffer ("\n" ^ marker ^ " "); 145 + 146 + (* Get text content, excluding link tags *) 147 + let text = Soup.texts elem |> String.concat " " |> String.trim in 148 + Buffer.add_string buffer text; 149 + 150 + Buffer.add_string buffer "\n\n"; 151 + need_space := false 152 + in 153 + 154 + let rec process_node node = 155 + match Soup.element node with 156 + | Some elem -> 157 + let tag = Soup.name elem in 158 + (match tag with 159 + (* Block elements - reset space tracking *) 160 + | "h1" -> process_header 1 elem 161 + | "h2" -> process_header 2 elem 162 + | "h3" -> process_header 3 elem 163 + | "h4" -> process_header 4 elem 164 + | "h5" -> process_header 5 elem 165 + | "h6" -> process_header 6 elem 166 + | "p" -> 167 + need_space := false; 168 + Soup.children elem |> Soup.iter process_node; 169 + Buffer.add_string buffer "\n\n"; 170 + need_space := false 171 + | "br" -> 172 + Buffer.add_string buffer "\n"; 173 + need_space := false 174 + (* Inline elements - preserve space tracking *) 175 + | "strong" | "b" -> 176 + (* Add space before if needed *) 177 + if !need_space then begin 178 + match last_char () with 179 + | Some (' ' | '\n') -> () 180 + | _ -> Buffer.add_char buffer ' ' 181 + end; 182 + Buffer.add_string buffer "**"; 183 + need_space := false; 184 + Soup.children elem |> Soup.iter process_node; 185 + Buffer.add_string buffer "**"; 186 + mark_space_needed () 187 + | "em" | "i" -> 188 + (* Add space before if needed *) 189 + if !need_space then begin 190 + match last_char () with 191 + | Some (' ' | '\n') -> () 192 + | _ -> Buffer.add_char buffer ' ' 193 + end; 194 + Buffer.add_string buffer "*"; 195 + need_space := false; 196 + Soup.children elem |> Soup.iter process_node; 197 + Buffer.add_string buffer "*"; 198 + mark_space_needed () 199 + | "code" -> 200 + (* Add space before if needed *) 201 + if !need_space then begin 202 + match last_char () with 203 + | Some (' ' | '\n') -> () 204 + | _ -> Buffer.add_char buffer ' ' 205 + end; 206 + Buffer.add_string buffer "`"; 207 + need_space := false; 208 + Soup.children elem |> Soup.iter process_node; 209 + Buffer.add_string buffer "`"; 210 + mark_space_needed () 211 + | "pre" -> 212 + need_space := false; 213 + Buffer.add_string buffer "\n```\n"; 214 + Soup.children elem |> Soup.iter process_node; 215 + Buffer.add_string buffer "\n```\n\n"; 216 + need_space := false 217 + | "a" -> 218 + let text = Soup.texts elem |> String.concat " " |> String.trim in 219 + let href = Soup.attribute "href" elem in 220 + (match href with 221 + | Some href -> 222 + (* Add space before link if needed *) 223 + if !need_space then begin 224 + match last_char () with 225 + | Some (' ' | '\n') -> () 226 + | _ -> Buffer.add_char buffer ' ' 227 + end; 228 + need_space := false; 229 + 230 + (* Add the link markdown *) 231 + if text = "" then 232 + Buffer.add_string buffer (Printf.sprintf "<%s>" href) 233 + else 234 + Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href); 235 + 236 + (* Mark that space may be needed after link *) 237 + mark_space_needed () 238 + | None -> 239 + add_text text) 240 + | "ul" | "ol" -> 241 + need_space := false; 242 + Buffer.add_string buffer "\n"; 243 + let is_ordered = tag = "ol" in 244 + let items = Soup.children elem |> Soup.to_list in 245 + List.iteri (fun i item -> 246 + match Soup.element item with 247 + | Some li when Soup.name li = "li" -> 248 + need_space := false; 249 + if is_ordered then 250 + Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1)) 251 + else 252 + Buffer.add_string buffer "- "; 253 + Soup.children li |> Soup.iter process_node; 254 + Buffer.add_string buffer "\n" 255 + | _ -> () 256 + ) items; 257 + Buffer.add_string buffer "\n"; 258 + need_space := false 259 + | "blockquote" -> 260 + need_space := false; 261 + Buffer.add_string buffer "\n> "; 262 + Soup.children elem |> Soup.iter process_node; 263 + Buffer.add_string buffer "\n\n"; 264 + need_space := false 265 + | "img" -> 266 + (* Add space before if needed *) 267 + if !need_space then begin 268 + match last_char () with 269 + | Some (' ' | '\n') -> () 270 + | _ -> Buffer.add_char buffer ' ' 271 + end; 272 + let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in 273 + let src = Soup.attribute "src" elem |> Option.value ~default:"" in 274 + Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src); 275 + need_space := false; 276 + mark_space_needed () 277 + | "hr" -> 278 + need_space := false; 279 + Buffer.add_string buffer "\n---\n\n"; 280 + need_space := false 281 + (* Strip these tags but keep content *) 282 + | "div" | "span" | "article" | "section" | "header" | "footer" 283 + | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" -> 284 + Soup.children elem |> Soup.iter process_node 285 + (* Ignore script, style, etc *) 286 + | "script" | "style" | "noscript" -> () 287 + (* Default: just process children *) 288 + | _ -> 289 + Soup.children elem |> Soup.iter process_node) 290 + | None -> 291 + (* Text node - handle whitespace properly *) 292 + match Soup.leaf_text node with 293 + | Some text -> 294 + (* If text is only whitespace, mark that we need space *) 295 + let trimmed = String.trim text in 296 + if trimmed = "" then begin 297 + if has_whitespace text then 298 + need_space := true 299 + end else begin 300 + (* Text has content - check if it had leading/trailing whitespace *) 301 + let had_leading_ws = has_whitespace text && 302 + (String.length text > 0 && 303 + (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in 304 + 305 + (* If had leading whitespace, mark we need space *) 306 + if had_leading_ws then need_space := true; 307 + 308 + (* Add the text content *) 309 + add_text trimmed; 310 + 311 + (* If had trailing whitespace, mark we need space for next *) 312 + let had_trailing_ws = has_whitespace text && 313 + (String.length text > 0 && 314 + let last = text.[String.length text - 1] in 315 + last = ' ' || last = '\t' || last = '\n' || last = '\r') in 316 + if had_trailing_ws then need_space := true 317 + end 318 + | None -> () 319 + in 320 + 321 + Soup.children soup |> Soup.iter process_node; 322 + 323 + (* Clean up the result *) 324 + let result = Buffer.contents buffer in 325 + cleanup_markdown result 326 + with _ -> html_str 327 + 328 + (** Convert HTML content to clean Markdown *) 329 + let to_markdown html_str = 330 + html_to_markdown html_str
+84
stack/river/lib/html_meta.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Internal utility for HTML meta tag extraction *) 19 + 20 + [@@@warning "-32"] (* Suppress unused value warnings for internal utilities *) 21 + 22 + (** This module determines an image to be used as preview of a website. 23 + 24 + It does this by following the same logic Google+ and other websites use, and 25 + described in this article: 26 + https://www.raymondcamden.com/2011/07/26/How-are-Facebook-and-Google-creating-link-previews *) 27 + 28 + let og_image html = 29 + let open Soup in 30 + let soup = parse html in 31 + try soup $ "meta[property=og:image]" |> R.attribute "content" |> Option.some 32 + with Failure _ -> None 33 + 34 + let image_src html = 35 + let open Soup in 36 + let soup = parse html in 37 + try soup $ "link[rel=\"image_src\"]" |> R.attribute "href" |> Option.some 38 + with Failure _ -> None 39 + 40 + let twitter_image html = 41 + let open Soup in 42 + let soup = parse html in 43 + try 44 + soup $ "meta[name=\"twitter:image\"]" |> R.attribute "content" 45 + |> Option.some 46 + with Failure _ -> None 47 + 48 + let og_description html = 49 + let open Soup in 50 + let soup = parse html in 51 + try 52 + soup $ "meta[property=og:description]" |> R.attribute "content" 53 + |> Option.some 54 + with Failure _ -> None 55 + 56 + let description html = 57 + let open Soup in 58 + let soup = parse html in 59 + try 60 + soup $ "meta[property=description]" |> R.attribute "content" |> Option.some 61 + with Failure _ -> None 62 + 63 + let preview_image html = 64 + let preview_image = 65 + match og_image html with 66 + | None -> ( 67 + match image_src html with 68 + | None -> twitter_image html 69 + | Some x -> Some x) 70 + | Some x -> Some x 71 + in 72 + match Option.map String.trim preview_image with 73 + | Some "" -> None 74 + | Some x -> Some x 75 + | None -> None 76 + 77 + let description html = 78 + let preview_image = 79 + match og_description html with None -> description html | Some x -> Some x 80 + in 81 + match Option.map String.trim preview_image with 82 + | Some "" -> None 83 + | Some x -> Some x 84 + | None -> None
+393
stack/river/lib/post.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Post representation and extraction from feeds. *) 19 + 20 + let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator" 21 + module Log = (val Logs.src_log src : Logs.LOG) 22 + 23 + type t = { 24 + id : string; 25 + title : string; 26 + link : Uri.t option; 27 + date : Syndic.Date.t option; 28 + feed : Feed.t; 29 + author : string; 30 + email : string; 31 + content : Soup.soup Soup.node; 32 + mutable link_response : (string, string) result option; 33 + tags : string list; 34 + summary : string option; 35 + } 36 + 37 + (** Generate a stable, unique ID from available data *) 38 + let generate_id ?guid ?link ?title ?date ~feed_url () = 39 + match guid with 40 + | Some id when id <> "" -> 41 + (* Use explicit ID/GUID if available *) 42 + id 43 + | _ -> 44 + match link with 45 + | Some uri when Uri.to_string uri <> "" -> 46 + (* Use permalink as ID (stable and unique) *) 47 + Uri.to_string uri 48 + | _ -> 49 + (* Fallback: hash of feed_url + title + date *) 50 + let title_str = Option.value title ~default:"" in 51 + let date_str = 52 + match date with 53 + | Some d -> Ptime.to_rfc3339 d 54 + | None -> "" 55 + in 56 + let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in 57 + (* Use SHA256 for stable hashing *) 58 + Digest.string composite |> Digest.to_hex 59 + 60 + let resolve_links_attr ~xmlbase attr el = 61 + Soup.R.attribute attr el 62 + |> Uri.of_string 63 + |> Syndic.XML.resolve ~xmlbase 64 + |> Uri.to_string 65 + |> fun value -> Soup.set_attribute attr value el 66 + 67 + (* Things that posts should not contain *) 68 + let undesired_tags = [ "style"; "script" ] 69 + let undesired_attr = [ "id" ] 70 + 71 + let html_of_text ?xmlbase s = 72 + let soup = Soup.parse s in 73 + let ($$) = Soup.($$) in 74 + soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href"); 75 + soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src"); 76 + undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete); 77 + soup $$ "*" |> Soup.iter (fun el -> 78 + undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el)); 79 + soup 80 + 81 + (* Do not trust sites using XML for HTML content. Convert to string and parse 82 + back. (Does not always fix bad HTML unfortunately.) *) 83 + let html_of_syndic = 84 + let ns_prefix _ = Some "" in 85 + fun ?xmlbase h -> 86 + html_of_text ?xmlbase 87 + (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h)) 88 + 89 + let string_of_option = function None -> "" | Some s -> s 90 + 91 + let post_compare p1 p2 = 92 + (* Most recent posts first. Posts with no date are always last *) 93 + match (p1.date, p2.date) with 94 + | Some d1, Some d2 -> Syndic.Date.compare d2 d1 95 + | None, Some _ -> 1 96 + | Some _, None -> -1 97 + | None, None -> 1 98 + 99 + let rec remove n l = 100 + if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl 101 + 102 + let rec take n = function 103 + | [] -> [] 104 + | e :: tl -> if n > 0 then e :: take (n - 1) tl else [] 105 + 106 + let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) = 107 + Log.debug (fun m -> m "Processing Atom entry: %s" 108 + (Text_extract.string_of_text_construct e.title)); 109 + 110 + let link = 111 + try 112 + Some 113 + (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links) 114 + .href 115 + with Not_found -> ( 116 + Log.debug (fun m -> m "No alternate link found, trying fallback"); 117 + match e.links with 118 + | l :: _ -> Some l.href 119 + | [] -> ( 120 + match Uri.scheme e.id with 121 + | Some "http" -> Some e.id 122 + | Some "https" -> Some e.id 123 + | _ -> None)) 124 + in 125 + let date = 126 + match e.published with Some _ -> e.published | None -> Some e.updated 127 + in 128 + let content = 129 + match e.content with 130 + | Some (Text s) -> html_of_text s 131 + | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s 132 + | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h 133 + | Some (Mime _) | Some (Src _) | None -> ( 134 + match e.summary with 135 + | Some (Text s) -> html_of_text s 136 + | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s 137 + | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h 138 + | None -> Soup.parse "") 139 + in 140 + let is_valid_author_name name = 141 + (* Filter out empty strings and placeholder values like "Unknown" *) 142 + let trimmed = String.trim name in 143 + trimmed <> "" && trimmed <> "Unknown" 144 + in 145 + let author_name = 146 + (* Fallback chain for author: 147 + 1. Entry author (if present, not empty, and not "Unknown") 148 + 2. Feed-level author (from Atom feed metadata) 149 + 3. Feed title (from Atom feed metadata) 150 + 4. Source name (manually entered feed name) *) 151 + try 152 + let author, _ = e.authors in 153 + let trimmed = String.trim author.name in 154 + if is_valid_author_name author.name then trimmed 155 + else raise Not_found (* Try feed-level author *) 156 + with Not_found -> ( 157 + match Feed.content feed with 158 + | Feed.Atom atom_feed -> ( 159 + (* Try feed-level authors *) 160 + match atom_feed.Syndic.Atom.authors with 161 + | author :: _ when is_valid_author_name author.name -> 162 + String.trim author.name 163 + | _ -> 164 + (* Use feed title *) 165 + Text_extract.string_of_text_construct atom_feed.Syndic.Atom.title) 166 + | Feed.Rss2 _ | Feed.Json _ -> 167 + (* For RSS2 and JSONFeed, use the source name *) 168 + Source.name (Feed.source feed)) 169 + in 170 + (* Extract tags from Atom categories *) 171 + let tags = 172 + List.map (fun cat -> cat.Syndic.Atom.term) e.categories 173 + in 174 + (* Extract summary - convert from text_construct to string *) 175 + let summary = 176 + match e.summary with 177 + | Some s -> Some (Text_extract.string_of_text_construct s) 178 + | None -> None 179 + in 180 + (* Generate unique ID *) 181 + let guid = Uri.to_string e.id in 182 + let title_str = Text_extract.string_of_text_construct e.title in 183 + let id = 184 + generate_id ~guid ?link ~title:title_str ?date 185 + ~feed_url:(Source.url (Feed.source feed)) () 186 + in 187 + { 188 + id; 189 + title = title_str; 190 + link; 191 + date; 192 + feed; 193 + author = author_name; 194 + email = ""; 195 + content; 196 + link_response = None; 197 + tags; 198 + summary; 199 + } 200 + 201 + let post_of_rss2 ~(feed : Feed.t) it = 202 + let title, content = 203 + match it.Syndic.Rss2.story with 204 + | All (t, xmlbase, d) -> ( 205 + ( t, 206 + match it.content with 207 + | _, "" -> html_of_text ?xmlbase d 208 + | xmlbase, c -> html_of_text ?xmlbase c )) 209 + | Title t -> 210 + let xmlbase, c = it.content in 211 + (t, html_of_text ?xmlbase c) 212 + | Description (xmlbase, d) -> ( 213 + ( "", 214 + match it.content with 215 + | _, "" -> html_of_text ?xmlbase d 216 + | xmlbase, c -> html_of_text ?xmlbase c )) 217 + in 218 + (* Note: it.link is of type Uri.t option in Syndic *) 219 + let link = 220 + match (it.guid, it.link) with 221 + | Some u, _ when u.permalink -> Some u.data 222 + | _, Some _ -> it.link 223 + | Some u, _ -> 224 + (* Sometimes the guid is indicated with isPermaLink="false" but is 225 + nonetheless the only URL we get (e.g. ocamlpro). *) 226 + Some u.data 227 + | None, None -> None 228 + in 229 + (* Extract GUID string for ID generation *) 230 + let guid_str = 231 + match it.guid with 232 + | Some u -> Some (Uri.to_string u.data) 233 + | None -> None 234 + in 235 + (* RSS2 doesn't have a categories field exposed, use empty list *) 236 + let tags = [] in 237 + (* RSS2 doesn't have a separate summary field, so leave it empty *) 238 + let summary = None in 239 + (* Generate unique ID *) 240 + let id = 241 + generate_id ?guid:guid_str ?link ~title ?date:it.pubDate 242 + ~feed_url:(Source.url (Feed.source feed)) () 243 + in 244 + { 245 + id; 246 + title; 247 + link; 248 + feed; 249 + author = Source.name (Feed.source feed); 250 + email = string_of_option it.author; 251 + content; 252 + date = it.pubDate; 253 + link_response = None; 254 + tags; 255 + summary; 256 + } 257 + 258 + let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) = 259 + Log.debug (fun m -> m "Processing JSONFeed item: %s" 260 + (Option.value (Jsonfeed.Item.title item) ~default:"Untitled")); 261 + 262 + (* Extract content - prefer HTML, fall back to text *) 263 + let content = 264 + match Jsonfeed.Item.content item with 265 + | `Html html -> html_of_text html 266 + | `Text text -> html_of_text text 267 + | `Both (html, _text) -> html_of_text html 268 + in 269 + 270 + (* Extract author - use first author if multiple *) 271 + let author_name, author_email = 272 + match Jsonfeed.Item.authors item with 273 + | Some (first :: _) -> 274 + let name = Jsonfeed.Author.name first |> Option.value ~default:"" in 275 + (* JSONFeed authors don't typically have email *) 276 + (name, "") 277 + | _ -> 278 + (* Fall back to feed-level authors or feed title *) 279 + (match Feed.content feed with 280 + | Feed.Json jsonfeed -> 281 + (match Jsonfeed.authors jsonfeed with 282 + | Some (first :: _) -> 283 + let name = Jsonfeed.Author.name first |> Option.value ~default:(Feed.title feed) in 284 + (name, "") 285 + | _ -> (Feed.title feed, "")) 286 + | _ -> (Feed.title feed, "")) 287 + in 288 + 289 + (* Link - use url field *) 290 + let link = 291 + Jsonfeed.Item.url item 292 + |> Option.map Uri.of_string 293 + in 294 + 295 + (* Date *) 296 + let date = Jsonfeed.Item.date_published item in 297 + 298 + (* Summary *) 299 + let summary = Jsonfeed.Item.summary item in 300 + 301 + (* Tags *) 302 + let tags = 303 + Jsonfeed.Item.tags item 304 + |> Option.value ~default:[] 305 + in 306 + 307 + (* Generate unique ID - JSONFeed items always have an id field (required) *) 308 + let guid = Jsonfeed.Item.id item in 309 + let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in 310 + let id = 311 + generate_id ~guid ?link ~title:title_str ?date 312 + ~feed_url:(Source.url (Feed.source feed)) () 313 + in 314 + 315 + { 316 + id; 317 + title = title_str; 318 + link; 319 + date; 320 + feed; 321 + author = author_name; 322 + email = author_email; 323 + content; 324 + link_response = None; 325 + tags; 326 + summary; 327 + } 328 + 329 + let posts_of_feed c = 330 + match Feed.content c with 331 + | Feed.Atom f -> 332 + let posts = List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries in 333 + Log.debug (fun m -> m "Extracted %d posts from Atom feed '%s'" 334 + (List.length posts) (Source.name (Feed.source c))); 335 + posts 336 + | Feed.Rss2 ch -> 337 + let posts = List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items in 338 + Log.debug (fun m -> m "Extracted %d posts from RSS2 feed '%s'" 339 + (List.length posts) (Source.name (Feed.source c))); 340 + posts 341 + | Feed.Json jsonfeed -> 342 + let items = Jsonfeed.items jsonfeed in 343 + let posts = List.map (post_of_jsonfeed_item ~feed:c) items in 344 + Log.debug (fun m -> m "Extracted %d posts from JSONFeed '%s'" 345 + (List.length posts) (Source.name (Feed.source c))); 346 + posts 347 + 348 + let get_posts ?n ?(ofs = 0) planet_feeds = 349 + Log.info (fun m -> m "Processing %d feeds for posts" (List.length planet_feeds)); 350 + 351 + let posts = List.concat @@ List.map posts_of_feed planet_feeds in 352 + Log.debug (fun m -> m "Total posts collected: %d" (List.length posts)); 353 + 354 + let posts = List.sort post_compare posts in 355 + Log.debug (fun m -> m "Posts sorted by date (most recent first)"); 356 + 357 + let posts = remove ofs posts in 358 + let result = 359 + match n with 360 + | None -> 361 + Log.debug (fun m -> m "Returning all %d posts (offset=%d)" 362 + (List.length posts) ofs); 363 + posts 364 + | Some n -> 365 + let limited = take n posts in 366 + Log.debug (fun m -> m "Returning %d posts (requested=%d, offset=%d)" 367 + (List.length limited) n ofs); 368 + limited 369 + in 370 + result 371 + 372 + let of_feeds feeds = get_posts feeds 373 + 374 + let feed t = t.feed 375 + let title t = t.title 376 + let link t = t.link 377 + let date t = t.date 378 + let author t = t.author 379 + let email t = t.email 380 + let content t = Soup.to_string t.content 381 + let id t = t.id 382 + let tags t = t.tags 383 + let summary t = t.summary 384 + 385 + let meta_description _t = 386 + (* TODO: This requires environment for HTTP access *) 387 + Log.debug (fun m -> m "meta_description not implemented (requires environment)"); 388 + None 389 + 390 + let seo_image _t = 391 + (* TODO: This requires environment for HTTP access *) 392 + Log.debug (fun m -> m "seo_image not implemented (requires environment)"); 393 + None
+68
stack/river/lib/post.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Post representation and extraction from feeds. *) 19 + 20 + type t 21 + (** A post from a feed. *) 22 + 23 + val of_feeds : Feed.t list -> t list 24 + (** [of_feeds feeds] extracts and deduplicates posts from the given feeds. 25 + 26 + Posts are deduplicated by ID. *) 27 + 28 + val feed : t -> Feed.t 29 + (** [feed post] returns the feed this post originated from. *) 30 + 31 + val title : t -> string 32 + (** [title post] returns the post title. *) 33 + 34 + val link : t -> Uri.t option 35 + (** [link post] returns the post link. *) 36 + 37 + val date : t -> Syndic.Date.t option 38 + (** [date post] returns the post date. *) 39 + 40 + val author : t -> string 41 + (** [author post] returns the post author name. *) 42 + 43 + val email : t -> string 44 + (** [email post] returns the post author email. *) 45 + 46 + val content : t -> string 47 + (** [content post] returns the post content. *) 48 + 49 + val id : t -> string 50 + (** [id post] returns the unique identifier of the post. *) 51 + 52 + val tags : t -> string list 53 + (** [tags post] returns the list of tags associated with the post. *) 54 + 55 + val summary : t -> string option 56 + (** [summary post] returns the summary/excerpt of the post, if available. *) 57 + 58 + val meta_description : t -> string option 59 + (** [meta_description post] returns the meta description from the origin site. 60 + 61 + To get the meta description, we fetch the content of [link post] and look 62 + for an HTML meta tag with name "description" or "og:description". *) 63 + 64 + val seo_image : t -> string option 65 + (** [seo_image post] returns the social media image URL. 66 + 67 + To get the SEO image, we fetch the content of [link post] and look for an 68 + HTML meta tag with name "og:image" or "twitter:image". *)
+192
stack/river/lib/quality.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed quality analysis. *) 19 + 20 + type t = { 21 + total_entries : int; 22 + entries_with_summary : int; 23 + entries_with_author : int; 24 + entries_with_date : int; 25 + entries_with_content : int; 26 + entries_with_tags : int; 27 + avg_content_length : float; 28 + min_content_length : int; 29 + max_content_length : int; 30 + posting_frequency_days : float option; 31 + quality_score : float; 32 + } 33 + 34 + let make ~total_entries ~entries_with_summary ~entries_with_author 35 + ~entries_with_date ~entries_with_content ~entries_with_tags 36 + ~avg_content_length ~min_content_length ~max_content_length 37 + ~posting_frequency_days ~quality_score = 38 + { 39 + total_entries; 40 + entries_with_summary; 41 + entries_with_author; 42 + entries_with_date; 43 + entries_with_content; 44 + entries_with_tags; 45 + avg_content_length; 46 + min_content_length; 47 + max_content_length; 48 + posting_frequency_days; 49 + quality_score; 50 + } 51 + 52 + let total_entries t = t.total_entries 53 + let entries_with_summary t = t.entries_with_summary 54 + let entries_with_author t = t.entries_with_author 55 + let entries_with_date t = t.entries_with_date 56 + let entries_with_content t = t.entries_with_content 57 + let entries_with_tags t = t.entries_with_tags 58 + let avg_content_length t = t.avg_content_length 59 + let min_content_length t = t.min_content_length 60 + let max_content_length t = t.max_content_length 61 + let posting_frequency_days t = t.posting_frequency_days 62 + let quality_score t = t.quality_score 63 + 64 + (** Get content length from an Atom entry *) 65 + let get_content_length (entry : Syndic.Atom.entry) = 66 + match entry.content with 67 + | Some (Syndic.Atom.Text s) -> String.length s 68 + | Some (Syndic.Atom.Html (_, s)) -> String.length s 69 + | Some (Syndic.Atom.Xhtml (_, _)) -> 0 (* Could calculate but complex *) 70 + | Some (Syndic.Atom.Mime _) -> 0 71 + | Some (Syndic.Atom.Src _) -> 0 72 + | None -> ( 73 + match entry.summary with 74 + | Some (Syndic.Atom.Text s) -> String.length s 75 + | Some (Syndic.Atom.Html (_, s)) -> String.length s 76 + | Some (Syndic.Atom.Xhtml (_, _)) -> 0 77 + | None -> 0) 78 + 79 + (** Check if entry has non-empty summary *) 80 + let has_summary (entry : Syndic.Atom.entry) = 81 + match entry.summary with 82 + | Some (Syndic.Atom.Text s) when String.trim s <> "" -> true 83 + | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> true 84 + | Some (Syndic.Atom.Xhtml (_, _)) -> true 85 + | _ -> false 86 + 87 + (** Check if entry has author *) 88 + let has_author (entry : Syndic.Atom.entry) = 89 + let (author, _) = entry.authors in 90 + String.trim author.name <> "" 91 + 92 + (** Check if entry has content *) 93 + let has_content (entry : Syndic.Atom.entry) = 94 + get_content_length entry > 0 95 + 96 + (** Check if entry has tags/categories *) 97 + let has_tags (entry : Syndic.Atom.entry) = 98 + entry.categories <> [] 99 + 100 + (** Calculate quality score from metrics *) 101 + let calculate_quality_score t = 102 + let total = float_of_int t.total_entries in 103 + if total = 0.0 then 0.0 104 + else 105 + let summary_pct = float_of_int t.entries_with_summary /. total *. 100.0 in 106 + let author_pct = float_of_int t.entries_with_author /. total *. 100.0 in 107 + let date_pct = float_of_int t.entries_with_date /. total *. 100.0 in 108 + let content_pct = float_of_int t.entries_with_content /. total *. 100.0 in 109 + let tags_pct = float_of_int t.entries_with_tags /. total *. 100.0 in 110 + 111 + (* Weighted average: content and dates are most important *) 112 + let score = 113 + (content_pct *. 0.30) +. 114 + (date_pct *. 0.25) +. 115 + (author_pct *. 0.20) +. 116 + (summary_pct *. 0.15) +. 117 + (tags_pct *. 0.10) 118 + in 119 + score 120 + 121 + let analyze entries = 122 + if entries = [] then 123 + failwith "No entries to analyze" 124 + else 125 + let total_entries = List.length entries in 126 + 127 + let entries_with_summary = ref 0 in 128 + let entries_with_author = ref 0 in 129 + let entries_with_date = ref total_entries in (* All Atom entries have updated *) 130 + let entries_with_content = ref 0 in 131 + let entries_with_tags = ref 0 in 132 + let content_lengths = ref [] in 133 + let dates = ref [] in 134 + 135 + List.iter (fun (entry : Syndic.Atom.entry) -> 136 + if has_summary entry then incr entries_with_summary; 137 + if has_author entry then incr entries_with_author; 138 + if has_content entry then begin 139 + incr entries_with_content; 140 + content_lengths := get_content_length entry :: !content_lengths 141 + end; 142 + if has_tags entry then incr entries_with_tags; 143 + dates := entry.updated :: !dates 144 + ) entries; 145 + 146 + (* Calculate content statistics *) 147 + let avg_content_length, min_content_length, max_content_length = 148 + if !content_lengths = [] then 149 + (0.0, 0, 0) 150 + else 151 + let sorted = List.sort compare !content_lengths in 152 + let sum = List.fold_left (+) 0 sorted in 153 + let avg = float_of_int sum /. float_of_int (List.length sorted) in 154 + let min_len = List.hd sorted in 155 + let max_len = List.hd (List.rev sorted) in 156 + (avg, min_len, max_len) 157 + in 158 + 159 + (* Calculate posting frequency *) 160 + let posting_frequency_days = 161 + if List.length !dates < 2 then 162 + None 163 + else 164 + try 165 + let timestamps = List.map Ptime.to_float_s !dates in 166 + let sorted_timestamps = List.sort compare timestamps in 167 + let first = List.hd sorted_timestamps in 168 + let last = List.hd (List.rev sorted_timestamps) in 169 + let total_days = (last -. first) /. 86400.0 in 170 + let num_intervals = float_of_int (List.length sorted_timestamps - 1) in 171 + Some (total_days /. num_intervals) 172 + with _ -> None 173 + in 174 + 175 + (* Create metrics record (without quality_score first) *) 176 + let metrics = { 177 + total_entries; 178 + entries_with_summary = !entries_with_summary; 179 + entries_with_author = !entries_with_author; 180 + entries_with_date = !entries_with_date; 181 + entries_with_content = !entries_with_content; 182 + entries_with_tags = !entries_with_tags; 183 + avg_content_length; 184 + min_content_length; 185 + max_content_length; 186 + posting_frequency_days; 187 + quality_score = 0.0; (* Placeholder *) 188 + } in 189 + 190 + (* Calculate quality score *) 191 + let quality_score = calculate_quality_score metrics in 192 + { metrics with quality_score }
+57
stack/river/lib/quality.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed quality analysis. *) 19 + 20 + type t 21 + (** Quality metrics for a feed or user's aggregated feed. *) 22 + 23 + val make : 24 + total_entries:int -> 25 + entries_with_summary:int -> 26 + entries_with_author:int -> 27 + entries_with_date:int -> 28 + entries_with_content:int -> 29 + entries_with_tags:int -> 30 + avg_content_length:float -> 31 + min_content_length:int -> 32 + max_content_length:int -> 33 + posting_frequency_days:float option -> 34 + quality_score:float -> 35 + t 36 + (** [make ~total_entries ...] creates quality metrics. *) 37 + 38 + val total_entries : t -> int 39 + val entries_with_summary : t -> int 40 + val entries_with_author : t -> int 41 + val entries_with_date : t -> int 42 + val entries_with_content : t -> int 43 + val entries_with_tags : t -> int 44 + val avg_content_length : t -> float 45 + val min_content_length : t -> int 46 + val max_content_length : t -> int 47 + val posting_frequency_days : t -> float option 48 + val quality_score : t -> float 49 + (** Accessors for quality metrics. *) 50 + 51 + val analyze : Syndic.Atom.entry list -> t 52 + (** [analyze entries] computes quality metrics from Atom entries. 53 + 54 + The quality score is a weighted average of: 55 + - Content completeness (40%) 56 + - Metadata completeness (30%) 57 + - Content richness (30%) *)
+9 -1699
stack/river/lib/river.ml
··· 17 17 18 18 (** River RSS/Atom/JSONFeed aggregator library *) 19 19 20 - let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator" 21 - module Log = (val Logs.src_log src : Logs.LOG) 22 - 23 - (** {1 Internal Utilities} *) 24 - 25 - module Text_extract = struct 26 - open Syndic 27 - 28 - (* Remove all tags *) 29 - let rec syndic_to_buffer b = function 30 - | XML.Node (_, _, subs) -> List.iter (syndic_to_buffer b) subs 31 - | XML.Data (_, d) -> Buffer.add_string b d 32 - 33 - let syndic_to_string x = 34 - let b = Buffer.create 1024 in 35 - List.iter (syndic_to_buffer b) x; 36 - Buffer.contents b 37 - 38 - let string_of_text_construct : Atom.text_construct -> string = function 39 - | Atom.Text s | Atom.Html (_, s) -> s 40 - | Atom.Xhtml (_, x) -> syndic_to_string x 41 - end 42 - 43 - module Html_meta = struct 44 - [@@@warning "-32"] (* Suppress unused value warnings for internal utilities *) 45 - 46 - (** This module determines an image to be used as preview of a website. 47 - 48 - It does this by following the same logic Google+ and other websites use, and 49 - described in this article: 50 - https://www.raymondcamden.com/2011/07/26/How-are-Facebook-and-Google-creating-link-previews *) 51 - 52 - let og_image html = 53 - let open Soup in 54 - let soup = parse html in 55 - try soup $ "meta[property=og:image]" |> R.attribute "content" |> Option.some 56 - with Failure _ -> None 57 - 58 - let image_src html = 59 - let open Soup in 60 - let soup = parse html in 61 - try soup $ "link[rel=\"image_src\"]" |> R.attribute "href" |> Option.some 62 - with Failure _ -> None 63 - 64 - let twitter_image html = 65 - let open Soup in 66 - let soup = parse html in 67 - try 68 - soup $ "meta[name=\"twitter:image\"]" |> R.attribute "content" 69 - |> Option.some 70 - with Failure _ -> None 71 - 72 - let og_description html = 73 - let open Soup in 74 - let soup = parse html in 75 - try 76 - soup $ "meta[property=og:description]" |> R.attribute "content" 77 - |> Option.some 78 - with Failure _ -> None 79 - 80 - let description html = 81 - let open Soup in 82 - let soup = parse html in 83 - try 84 - soup $ "meta[property=description]" |> R.attribute "content" |> Option.some 85 - with Failure _ -> None 86 - 87 - let preview_image html = 88 - let preview_image = 89 - match og_image html with 90 - | None -> ( 91 - match image_src html with 92 - | None -> twitter_image html 93 - | Some x -> Some x) 94 - | Some x -> Some x 95 - in 96 - match Option.map String.trim preview_image with 97 - | Some "" -> None 98 - | Some x -> Some x 99 - | None -> None 100 - 101 - let description html = 102 - let preview_image = 103 - match og_description html with None -> description html | Some x -> Some x 104 - in 105 - match Option.map String.trim preview_image with 106 - | Some "" -> None 107 - | Some x -> Some x 108 - | None -> None 109 - end 110 - 111 - module Html_markdown = struct 112 - [@@@warning "-32"] (* Suppress unused value warnings for internal utilities *) 113 - 114 - (** HTML to Markdown converter using Lambda Soup *) 115 - 116 - (** Extract all links from HTML content *) 117 - let extract_links html_str = 118 - try 119 - let soup = Soup.parse html_str in 120 - let links = Soup.select "a[href]" soup in 121 - Soup.fold (fun acc link -> 122 - match Soup.attribute "href" link with 123 - | Some href -> 124 - let text = Soup.texts link |> String.concat "" |> String.trim in 125 - (href, text) :: acc 126 - | None -> acc 127 - ) [] links 128 - |> List.rev 129 - with _ -> [] 130 - 131 - (** Check if string contains any whitespace *) 132 - let has_whitespace s = 133 - try 134 - let _ = Str.search_forward (Str.regexp "[ \t\n\r]") s 0 in 135 - true 136 - with Not_found -> false 137 - 138 - (** Clean up excessive newlines and normalize spacing *) 139 - let cleanup_markdown s = 140 - (* Normalize line endings *) 141 - let s = Str.global_replace (Str.regexp "\r\n") "\n" s in 142 - 143 - (* Remove trailing whitespace from each line *) 144 - let lines = String.split_on_char '\n' s in 145 - let lines = List.map (fun line -> 146 - (* Trim trailing spaces but preserve leading spaces for indentation *) 147 - let len = String.length line in 148 - let rec find_last_non_space i = 149 - if i < 0 then -1 150 - else if line.[i] = ' ' || line.[i] = '\t' then find_last_non_space (i - 1) 151 - else i 152 - in 153 - let last = find_last_non_space (len - 1) in 154 - if last < 0 then "" 155 - else String.sub line 0 (last + 1) 156 - ) lines in 157 - 158 - (* Join back and collapse excessive blank lines *) 159 - let s = String.concat "\n" lines in 160 - 161 - (* Replace 3+ consecutive newlines with just 2 *) 162 - let s = Str.global_replace (Str.regexp "\n\n\n+") "\n\n" s in 163 - 164 - (* Trim leading and trailing whitespace *) 165 - String.trim s 166 - 167 - (** Convert HTML to Markdown using state-based whitespace handling *) 168 - let html_to_markdown html_str = 169 - try 170 - let soup = Soup.parse html_str in 171 - let buffer = Buffer.create 256 in 172 - 173 - (* State: track if we need to insert a space before next text *) 174 - let need_space = ref false in 175 - 176 - (* Get last character in buffer, if any *) 177 - let last_char () = 178 - let len = Buffer.length buffer in 179 - if len = 0 then None 180 - else Some (Buffer.nth buffer (len - 1)) 181 - in 182 - 183 - (* Add text with proper spacing *) 184 - let add_text text = 185 - let trimmed = String.trim text in 186 - if trimmed <> "" then begin 187 - (* Check if text starts with punctuation that shouldn't have space before it *) 188 - let starts_with_punctuation = 189 - String.length trimmed > 0 && 190 - (match trimmed.[0] with 191 - | ',' | '.' | ';' | ':' | '!' | '?' | ')' | ']' | '}' -> true 192 - | _ -> false) 193 - in 194 - 195 - (* Add space if needed, unless we're before punctuation *) 196 - if !need_space && not starts_with_punctuation then begin 197 - match last_char () with 198 - | Some (' ' | '\n') -> () 199 - | _ -> Buffer.add_char buffer ' ' 200 - end; 201 - Buffer.add_string buffer trimmed; 202 - need_space := false 203 - end 204 - in 205 - 206 - (* Mark that we need space before next text (for inline elements) *) 207 - let mark_space_needed () = 208 - need_space := has_whitespace (Buffer.contents buffer) || Buffer.length buffer > 0 209 - in 210 - 211 - (* Process header with ID/anchor handling *) 212 - let process_header level elem = 213 - need_space := false; 214 - 215 - (* Check if header contains a link with an ID fragment *) 216 - let link_opt = Soup.select_one "a[href]" elem in 217 - let anchor_id = match link_opt with 218 - | Some link -> 219 - (match Soup.attribute "href" link with 220 - | Some href -> 221 - (* Extract fragment from URL *) 222 - let uri = Uri.of_string href in 223 - Uri.fragment uri 224 - | None -> None) 225 - | None -> None 226 - in 227 - 228 - (* Add anchor if we found an ID *) 229 - (match anchor_id with 230 - | Some id when id <> "" -> 231 - Buffer.add_string buffer (Printf.sprintf "\n<a name=\"%s\"></a>\n" id) 232 - | _ -> ()); 233 - 234 - (* Add the header marker *) 235 - let marker = String.make level '#' in 236 - Buffer.add_string buffer ("\n" ^ marker ^ " "); 237 - 238 - (* Get text content, excluding link tags *) 239 - let text = Soup.texts elem |> String.concat " " |> String.trim in 240 - Buffer.add_string buffer text; 241 - 242 - Buffer.add_string buffer "\n\n"; 243 - need_space := false 244 - in 245 - 246 - let rec process_node node = 247 - match Soup.element node with 248 - | Some elem -> 249 - let tag = Soup.name elem in 250 - (match tag with 251 - (* Block elements - reset space tracking *) 252 - | "h1" -> process_header 1 elem 253 - | "h2" -> process_header 2 elem 254 - | "h3" -> process_header 3 elem 255 - | "h4" -> process_header 4 elem 256 - | "h5" -> process_header 5 elem 257 - | "h6" -> process_header 6 elem 258 - | "p" -> 259 - need_space := false; 260 - Soup.children elem |> Soup.iter process_node; 261 - Buffer.add_string buffer "\n\n"; 262 - need_space := false 263 - | "br" -> 264 - Buffer.add_string buffer "\n"; 265 - need_space := false 266 - (* Inline elements - preserve space tracking *) 267 - | "strong" | "b" -> 268 - (* Add space before if needed *) 269 - if !need_space then begin 270 - match last_char () with 271 - | Some (' ' | '\n') -> () 272 - | _ -> Buffer.add_char buffer ' ' 273 - end; 274 - Buffer.add_string buffer "**"; 275 - need_space := false; 276 - Soup.children elem |> Soup.iter process_node; 277 - Buffer.add_string buffer "**"; 278 - mark_space_needed () 279 - | "em" | "i" -> 280 - (* Add space before if needed *) 281 - if !need_space then begin 282 - match last_char () with 283 - | Some (' ' | '\n') -> () 284 - | _ -> Buffer.add_char buffer ' ' 285 - end; 286 - Buffer.add_string buffer "*"; 287 - need_space := false; 288 - Soup.children elem |> Soup.iter process_node; 289 - Buffer.add_string buffer "*"; 290 - mark_space_needed () 291 - | "code" -> 292 - (* Add space before if needed *) 293 - if !need_space then begin 294 - match last_char () with 295 - | Some (' ' | '\n') -> () 296 - | _ -> Buffer.add_char buffer ' ' 297 - end; 298 - Buffer.add_string buffer "`"; 299 - need_space := false; 300 - Soup.children elem |> Soup.iter process_node; 301 - Buffer.add_string buffer "`"; 302 - mark_space_needed () 303 - | "pre" -> 304 - need_space := false; 305 - Buffer.add_string buffer "\n```\n"; 306 - Soup.children elem |> Soup.iter process_node; 307 - Buffer.add_string buffer "\n```\n\n"; 308 - need_space := false 309 - | "a" -> 310 - let text = Soup.texts elem |> String.concat " " |> String.trim in 311 - let href = Soup.attribute "href" elem in 312 - (match href with 313 - | Some href -> 314 - (* Add space before link if needed *) 315 - if !need_space then begin 316 - match last_char () with 317 - | Some (' ' | '\n') -> () 318 - | _ -> Buffer.add_char buffer ' ' 319 - end; 320 - need_space := false; 321 - 322 - (* Add the link markdown *) 323 - if text = "" then 324 - Buffer.add_string buffer (Printf.sprintf "<%s>" href) 325 - else 326 - Buffer.add_string buffer (Printf.sprintf "[%s](%s)" text href); 327 - 328 - (* Mark that space may be needed after link *) 329 - mark_space_needed () 330 - | None -> 331 - add_text text) 332 - | "ul" | "ol" -> 333 - need_space := false; 334 - Buffer.add_string buffer "\n"; 335 - let is_ordered = tag = "ol" in 336 - let items = Soup.children elem |> Soup.to_list in 337 - List.iteri (fun i item -> 338 - match Soup.element item with 339 - | Some li when Soup.name li = "li" -> 340 - need_space := false; 341 - if is_ordered then 342 - Buffer.add_string buffer (Printf.sprintf "%d. " (i + 1)) 343 - else 344 - Buffer.add_string buffer "- "; 345 - Soup.children li |> Soup.iter process_node; 346 - Buffer.add_string buffer "\n" 347 - | _ -> () 348 - ) items; 349 - Buffer.add_string buffer "\n"; 350 - need_space := false 351 - | "blockquote" -> 352 - need_space := false; 353 - Buffer.add_string buffer "\n> "; 354 - Soup.children elem |> Soup.iter process_node; 355 - Buffer.add_string buffer "\n\n"; 356 - need_space := false 357 - | "img" -> 358 - (* Add space before if needed *) 359 - if !need_space then begin 360 - match last_char () with 361 - | Some (' ' | '\n') -> () 362 - | _ -> Buffer.add_char buffer ' ' 363 - end; 364 - let alt = Soup.attribute "alt" elem |> Option.value ~default:"" in 365 - let src = Soup.attribute "src" elem |> Option.value ~default:"" in 366 - Buffer.add_string buffer (Printf.sprintf "![%s](%s)" alt src); 367 - need_space := false; 368 - mark_space_needed () 369 - | "hr" -> 370 - need_space := false; 371 - Buffer.add_string buffer "\n---\n\n"; 372 - need_space := false 373 - (* Strip these tags but keep content *) 374 - | "div" | "span" | "article" | "section" | "header" | "footer" 375 - | "main" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" -> 376 - Soup.children elem |> Soup.iter process_node 377 - (* Ignore script, style, etc *) 378 - | "script" | "style" | "noscript" -> () 379 - (* Default: just process children *) 380 - | _ -> 381 - Soup.children elem |> Soup.iter process_node) 382 - | None -> 383 - (* Text node - handle whitespace properly *) 384 - match Soup.leaf_text node with 385 - | Some text -> 386 - (* If text is only whitespace, mark that we need space *) 387 - let trimmed = String.trim text in 388 - if trimmed = "" then begin 389 - if has_whitespace text then 390 - need_space := true 391 - end else begin 392 - (* Text has content - check if it had leading/trailing whitespace *) 393 - let had_leading_ws = has_whitespace text && 394 - (String.length text > 0 && 395 - (text.[0] = ' ' || text.[0] = '\t' || text.[0] = '\n' || text.[0] = '\r')) in 396 - 397 - (* If had leading whitespace, mark we need space *) 398 - if had_leading_ws then need_space := true; 399 - 400 - (* Add the text content *) 401 - add_text trimmed; 402 - 403 - (* If had trailing whitespace, mark we need space for next *) 404 - let had_trailing_ws = has_whitespace text && 405 - (String.length text > 0 && 406 - let last = text.[String.length text - 1] in 407 - last = ' ' || last = '\t' || last = '\n' || last = '\r') in 408 - if had_trailing_ws then need_space := true 409 - end 410 - | None -> () 411 - in 412 - 413 - Soup.children soup |> Soup.iter process_node; 414 - 415 - (* Clean up the result *) 416 - let result = Buffer.contents buffer in 417 - cleanup_markdown result 418 - with _ -> html_str 419 - 420 - (** Convert HTML content to clean Markdown *) 421 - let to_markdown html_str = 422 - html_to_markdown html_str 423 - end 424 - 425 - (** {1 Feed Sources} *) 426 - 427 - module Source = struct 428 - type t = { 429 - name : string; 430 - url : string; 431 - } 432 - 433 - let make ~name ~url = { name; url } 434 - 435 - let name t = t.name 436 - let url t = t.url 437 - 438 - let jsont = 439 - let make name url = { name; url } in 440 - Jsont.Object.map ~kind:"Source" make 441 - |> Jsont.Object.mem "name" Jsont.string ~enc:(fun s -> s.name) 442 - |> Jsont.Object.mem "url" Jsont.string ~enc:(fun s -> s.url) 443 - |> Jsont.Object.finish 444 - end 445 - 446 - (** {1 HTTP Session Management} *) 447 - 448 - module Session = struct 449 - type t = { 450 - session : (float Eio.Time.clock_ty Eio.Resource.t, 451 - [`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t; 452 - } 453 - 454 - let init ~sw env = 455 - Log.info (fun m -> m "Initializing River session"); 456 - let session = Requests.create ~sw 457 - ~default_headers:(Requests.Headers.of_list [ 458 - ("User-Agent", "OCaml-River/1.0"); 459 - ]) 460 - ~follow_redirects:true 461 - ~max_redirects:5 462 - ~verify_tls:true 463 - env 464 - in 465 - { session } 466 - 467 - let with_session env f = 468 - Log.info (fun m -> m "Creating River session"); 469 - Eio.Switch.run @@ fun sw -> 470 - let client = init ~sw env in 471 - f client 472 - 473 - let get_requests_session t = t.session 474 - end 475 - 476 - (** {1 Feeds and Posts} *) 477 - 478 - module Feed = struct 479 - type feed_content = 480 - | Atom of Syndic.Atom.feed 481 - | Rss2 of Syndic.Rss2.channel 482 - | Json of Jsonfeed.t 483 - 484 - type t = { 485 - source : Source.t; 486 - title : string; 487 - content : feed_content; 488 - } 489 - 490 - let string_of_feed = function 491 - | Atom _ -> "Atom" 492 - | Rss2 _ -> "Rss2" 493 - | Json _ -> "JSONFeed" 494 - 495 - let classify_feed ~xmlbase (body : string) = 496 - Log.debug (fun m -> m "Attempting to parse feed (%d bytes)" (String.length body)); 497 - 498 - (* Quick check - does it look like JSON? *) 499 - let looks_like_json = 500 - String.length body > 0 && 501 - let first_char = String.get body 0 in 502 - first_char = '{' || first_char = '[' 503 - in 504 - 505 - if looks_like_json then ( 506 - (* Try JSONFeed first *) 507 - Log.debug (fun m -> m "Body looks like JSON, trying JSONFeed parser"); 508 - match Jsonfeed.of_string body with 509 - | Ok jsonfeed -> 510 - Log.debug (fun m -> m "Successfully parsed as JSONFeed"); 511 - Json jsonfeed 512 - | Error err -> 513 - Log.debug (fun m -> m "Not a JSONFeed: %s" (Jsont.Error.to_string err)); 514 - (* Fall through to XML parsing *) 515 - failwith "Not a valid JSONFeed" 516 - ) else ( 517 - (* Try XML formats *) 518 - try 519 - let feed = Atom (Syndic.Atom.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in 520 - Log.debug (fun m -> m "Successfully parsed as Atom feed"); 521 - feed 522 - with 523 - | Syndic.Atom.Error.Error (pos, msg) -> ( 524 - Log.debug (fun m -> m "Not an Atom feed: %s at position (%d, %d)" 525 - msg (fst pos) (snd pos)); 526 - try 527 - let feed = Rss2 (Syndic.Rss2.parse ~xmlbase (Xmlm.make_input (`String (0, body)))) in 528 - Log.debug (fun m -> m "Successfully parsed as RSS2 feed"); 529 - feed 530 - with Syndic.Rss2.Error.Error (pos, msg) -> 531 - Log.err (fun m -> m "Failed to parse as RSS2: %s at position (%d, %d)" 532 - msg (fst pos) (snd pos)); 533 - failwith "Neither Atom nor RSS2 feed") 534 - | Not_found as e -> 535 - Log.err (fun m -> m "Not_found exception during Atom feed parsing"); 536 - Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ())); 537 - raise e 538 - | e -> 539 - Log.err (fun m -> m "Unexpected exception during feed parsing: %s" 540 - (Printexc.to_string e)); 541 - Log.err (fun m -> m "Backtrace:\n%s" (Printexc.get_backtrace ())); 542 - raise e 543 - ) 544 - 545 - let fetch session source = 546 - Log.info (fun m -> m "Fetching feed: %s" (Source.name source)); 547 - 548 - let xmlbase = Uri.of_string (Source.url source) in 549 - 550 - (* Use Requests_json_api.get_result for clean Result-based error handling *) 551 - let requests_session = Session.get_requests_session session in 552 - let response = 553 - match Requests_json_api.get_result requests_session (Source.url source) with 554 - | Ok body -> 555 - Log.info (fun m -> m "Successfully fetched %s (%d bytes)" 556 - (Source.url source) (String.length body)); 557 - body 558 - | Error (status, msg) -> 559 - Log.err (fun m -> m "Failed to fetch feed '%s': HTTP %d - %s" 560 - (Source.name source) status msg); 561 - failwith (Printf.sprintf "HTTP %d: %s" status msg) 562 - in 563 - 564 - let content = classify_feed ~xmlbase response in 565 - let title = 566 - match content with 567 - | Atom atom -> Text_extract.string_of_text_construct atom.Syndic.Atom.title 568 - | Rss2 ch -> ch.Syndic.Rss2.title 569 - | Json jsonfeed -> Jsonfeed.title jsonfeed 570 - in 571 - 572 - Log.info (fun m -> m "Successfully fetched %s feed '%s' (title: '%s')" 573 - (string_of_feed content) (Source.name source) title); 574 - 575 - { source; title; content } 576 - 577 - let source t = t.source 578 - end 579 - 580 - (** {1 Posts} *) 581 - 582 - module Post = struct 583 - type t = { 584 - id : string; 585 - title : string; 586 - link : Uri.t option; 587 - date : Syndic.Date.t option; 588 - feed : Feed.t; 589 - author : string; 590 - email : string; 591 - content : Soup.soup Soup.node; 592 - mutable link_response : (string, string) result option; 593 - tags : string list; 594 - summary : string option; 595 - } 596 - 597 - (** Generate a stable, unique ID from available data *) 598 - let generate_id ?guid ?link ?title ?date ~feed_url () = 599 - match guid with 600 - | Some id when id <> "" -> 601 - (* Use explicit ID/GUID if available *) 602 - id 603 - | _ -> 604 - match link with 605 - | Some uri when Uri.to_string uri <> "" -> 606 - (* Use permalink as ID (stable and unique) *) 607 - Uri.to_string uri 608 - | _ -> 609 - (* Fallback: hash of feed_url + title + date *) 610 - let title_str = Option.value title ~default:"" in 611 - let date_str = 612 - match date with 613 - | Some d -> Ptime.to_rfc3339 d 614 - | None -> "" 615 - in 616 - let composite = Printf.sprintf "%s|%s|%s" feed_url title_str date_str in 617 - (* Use SHA256 for stable hashing *) 618 - Digest.string composite |> Digest.to_hex 619 - 620 - let resolve_links_attr ~xmlbase attr el = 621 - Soup.R.attribute attr el 622 - |> Uri.of_string 623 - |> Syndic.XML.resolve ~xmlbase 624 - |> Uri.to_string 625 - |> fun value -> Soup.set_attribute attr value el 626 - 627 - (* Things that posts should not contain *) 628 - let undesired_tags = [ "style"; "script" ] 629 - let undesired_attr = [ "id" ] 630 - 631 - let html_of_text ?xmlbase s = 632 - let soup = Soup.parse s in 633 - let ($$) = Soup.($$) in 634 - soup $$ "a[href]" |> Soup.iter (resolve_links_attr ~xmlbase "href"); 635 - soup $$ "img[src]" |> Soup.iter (resolve_links_attr ~xmlbase "src"); 636 - undesired_tags |> List.iter (fun tag -> soup $$ tag |> Soup.iter Soup.delete); 637 - soup $$ "*" |> Soup.iter (fun el -> 638 - undesired_attr |> List.iter (fun attr -> Soup.delete_attribute attr el)); 639 - soup 640 - 641 - (* Do not trust sites using XML for HTML content. Convert to string and parse 642 - back. (Does not always fix bad HTML unfortunately.) *) 643 - let html_of_syndic = 644 - let ns_prefix _ = Some "" in 645 - fun ?xmlbase h -> 646 - html_of_text ?xmlbase 647 - (String.concat "" (List.map (Syndic.XML.to_string ~ns_prefix) h)) 648 - 649 - let string_of_option = function None -> "" | Some s -> s 650 - 651 - let post_compare p1 p2 = 652 - (* Most recent posts first. Posts with no date are always last *) 653 - match (p1.date, p2.date) with 654 - | Some d1, Some d2 -> Syndic.Date.compare d2 d1 655 - | None, Some _ -> 1 656 - | Some _, None -> -1 657 - | None, None -> 1 658 - 659 - let rec remove n l = 660 - if n <= 0 then l else match l with [] -> [] | _ :: tl -> remove (n - 1) tl 661 - 662 - let rec take n = function 663 - | [] -> [] 664 - | e :: tl -> if n > 0 then e :: take (n - 1) tl else [] 665 - 666 - let post_of_atom ~(feed : Feed.t) (e : Syndic.Atom.entry) = 667 - Log.debug (fun m -> m "Processing Atom entry: %s" 668 - (Text_extract.string_of_text_construct e.title)); 669 - 670 - let link = 671 - try 672 - Some 673 - (List.find (fun l -> l.Syndic.Atom.rel = Syndic.Atom.Alternate) e.links) 674 - .href 675 - with Not_found -> ( 676 - Log.debug (fun m -> m "No alternate link found, trying fallback"); 677 - match e.links with 678 - | l :: _ -> Some l.href 679 - | [] -> ( 680 - match Uri.scheme e.id with 681 - | Some "http" -> Some e.id 682 - | Some "https" -> Some e.id 683 - | _ -> None)) 684 - in 685 - let date = 686 - match e.published with Some _ -> e.published | None -> Some e.updated 687 - in 688 - let content = 689 - match e.content with 690 - | Some (Text s) -> html_of_text s 691 - | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s 692 - | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h 693 - | Some (Mime _) | Some (Src _) | None -> ( 694 - match e.summary with 695 - | Some (Text s) -> html_of_text s 696 - | Some (Html (xmlbase, s)) -> html_of_text ?xmlbase s 697 - | Some (Xhtml (xmlbase, h)) -> html_of_syndic ?xmlbase h 698 - | None -> Soup.parse "") 699 - in 700 - let is_valid_author_name name = 701 - (* Filter out empty strings and placeholder values like "Unknown" *) 702 - let trimmed = String.trim name in 703 - trimmed <> "" && trimmed <> "Unknown" 704 - in 705 - let author_name = 706 - (* Fallback chain for author: 707 - 1. Entry author (if present, not empty, and not "Unknown") 708 - 2. Feed-level author (from Atom feed metadata) 709 - 3. Feed title (from Atom feed metadata) 710 - 4. Source name (manually entered feed name) *) 711 - try 712 - let author, _ = e.authors in 713 - let trimmed = String.trim author.name in 714 - if is_valid_author_name author.name then trimmed 715 - else raise Not_found (* Try feed-level author *) 716 - with Not_found -> ( 717 - match feed.content with 718 - | Feed.Atom atom_feed -> ( 719 - (* Try feed-level authors *) 720 - match atom_feed.Syndic.Atom.authors with 721 - | author :: _ when is_valid_author_name author.name -> 722 - String.trim author.name 723 - | _ -> 724 - (* Use feed title *) 725 - Text_extract.string_of_text_construct atom_feed.Syndic.Atom.title) 726 - | Feed.Rss2 _ | Feed.Json _ -> 727 - (* For RSS2 and JSONFeed, use the source name *) 728 - Source.name feed.source) 729 - in 730 - (* Extract tags from Atom categories *) 731 - let tags = 732 - List.map (fun cat -> cat.Syndic.Atom.term) e.categories 733 - in 734 - (* Extract summary - convert from text_construct to string *) 735 - let summary = 736 - match e.summary with 737 - | Some s -> Some (Text_extract.string_of_text_construct s) 738 - | None -> None 739 - in 740 - (* Generate unique ID *) 741 - let guid = Uri.to_string e.id in 742 - let title_str = Text_extract.string_of_text_construct e.title in 743 - let id = 744 - generate_id ~guid ?link ~title:title_str ?date 745 - ~feed_url:(Source.url feed.source) () 746 - in 747 - { 748 - id; 749 - title = title_str; 750 - link; 751 - date; 752 - feed; 753 - author = author_name; 754 - email = ""; 755 - content; 756 - link_response = None; 757 - tags; 758 - summary; 759 - } 760 - 761 - let post_of_rss2 ~(feed : Feed.t) it = 762 - let title, content = 763 - match it.Syndic.Rss2.story with 764 - | All (t, xmlbase, d) -> ( 765 - ( t, 766 - match it.content with 767 - | _, "" -> html_of_text ?xmlbase d 768 - | xmlbase, c -> html_of_text ?xmlbase c )) 769 - | Title t -> 770 - let xmlbase, c = it.content in 771 - (t, html_of_text ?xmlbase c) 772 - | Description (xmlbase, d) -> ( 773 - ( "", 774 - match it.content with 775 - | _, "" -> html_of_text ?xmlbase d 776 - | xmlbase, c -> html_of_text ?xmlbase c )) 777 - in 778 - (* Note: it.link is of type Uri.t option in Syndic *) 779 - let link = 780 - match (it.guid, it.link) with 781 - | Some u, _ when u.permalink -> Some u.data 782 - | _, Some _ -> it.link 783 - | Some u, _ -> 784 - (* Sometimes the guid is indicated with isPermaLink="false" but is 785 - nonetheless the only URL we get (e.g. ocamlpro). *) 786 - Some u.data 787 - | None, None -> None 788 - in 789 - (* Extract GUID string for ID generation *) 790 - let guid_str = 791 - match it.guid with 792 - | Some u -> Some (Uri.to_string u.data) 793 - | None -> None 794 - in 795 - (* RSS2 doesn't have a categories field exposed, use empty list *) 796 - let tags = [] in 797 - (* RSS2 doesn't have a separate summary field, so leave it empty *) 798 - let summary = None in 799 - (* Generate unique ID *) 800 - let id = 801 - generate_id ?guid:guid_str ?link ~title ?date:it.pubDate 802 - ~feed_url:(Source.url feed.source) () 803 - in 804 - { 805 - id; 806 - title; 807 - link; 808 - feed; 809 - author = Source.name feed.source; 810 - email = string_of_option it.author; 811 - content; 812 - date = it.pubDate; 813 - link_response = None; 814 - tags; 815 - summary; 816 - } 817 - 818 - let post_of_jsonfeed_item ~(feed : Feed.t) (item : Jsonfeed.Item.t) = 819 - Log.debug (fun m -> m "Processing JSONFeed item: %s" 820 - (Option.value (Jsonfeed.Item.title item) ~default:"Untitled")); 821 - 822 - (* Extract content - prefer HTML, fall back to text *) 823 - let content = 824 - match Jsonfeed.Item.content item with 825 - | `Html html -> html_of_text html 826 - | `Text text -> html_of_text text 827 - | `Both (html, _text) -> html_of_text html 828 - in 829 - 830 - (* Extract author - use first author if multiple *) 831 - let author_name, author_email = 832 - match Jsonfeed.Item.authors item with 833 - | Some (first :: _) -> 834 - let name = Jsonfeed.Author.name first |> Option.value ~default:"" in 835 - (* JSONFeed authors don't typically have email *) 836 - (name, "") 837 - | _ -> 838 - (* Fall back to feed-level authors or feed title *) 839 - (match feed.content with 840 - | Feed.Json jsonfeed -> 841 - (match Jsonfeed.authors jsonfeed with 842 - | Some (first :: _) -> 843 - let name = Jsonfeed.Author.name first |> Option.value ~default:feed.title in 844 - (name, "") 845 - | _ -> (feed.title, "")) 846 - | _ -> (feed.title, "")) 847 - in 848 - 849 - (* Link - use url field *) 850 - let link = 851 - Jsonfeed.Item.url item 852 - |> Option.map Uri.of_string 853 - in 854 - 855 - (* Date *) 856 - let date = Jsonfeed.Item.date_published item in 857 - 858 - (* Summary *) 859 - let summary = Jsonfeed.Item.summary item in 860 - 861 - (* Tags *) 862 - let tags = 863 - Jsonfeed.Item.tags item 864 - |> Option.value ~default:[] 865 - in 866 - 867 - (* Generate unique ID - JSONFeed items always have an id field (required) *) 868 - let guid = Jsonfeed.Item.id item in 869 - let title_str = Jsonfeed.Item.title item |> Option.value ~default:"Untitled" in 870 - let id = 871 - generate_id ~guid ?link ~title:title_str ?date 872 - ~feed_url:(Source.url feed.source) () 873 - in 874 - 875 - { 876 - id; 877 - title = title_str; 878 - link; 879 - date; 880 - feed; 881 - author = author_name; 882 - email = author_email; 883 - content; 884 - link_response = None; 885 - tags; 886 - summary; 887 - } 888 - 889 - let posts_of_feed c = 890 - match c.Feed.content with 891 - | Feed.Atom f -> 892 - let posts = List.map (post_of_atom ~feed:c) f.Syndic.Atom.entries in 893 - Log.debug (fun m -> m "Extracted %d posts from Atom feed '%s'" 894 - (List.length posts) (Source.name c.source)); 895 - posts 896 - | Feed.Rss2 ch -> 897 - let posts = List.map (post_of_rss2 ~feed:c) ch.Syndic.Rss2.items in 898 - Log.debug (fun m -> m "Extracted %d posts from RSS2 feed '%s'" 899 - (List.length posts) (Source.name c.source)); 900 - posts 901 - | Feed.Json jsonfeed -> 902 - let items = Jsonfeed.items jsonfeed in 903 - let posts = List.map (post_of_jsonfeed_item ~feed:c) items in 904 - Log.debug (fun m -> m "Extracted %d posts from JSONFeed '%s'" 905 - (List.length posts) (Source.name c.source)); 906 - posts 907 - 908 - let get_posts ?n ?(ofs = 0) planet_feeds = 909 - Log.info (fun m -> m "Processing %d feeds for posts" (List.length planet_feeds)); 910 - 911 - let posts = List.concat @@ List.map posts_of_feed planet_feeds in 912 - Log.debug (fun m -> m "Total posts collected: %d" (List.length posts)); 913 - 914 - let posts = List.sort post_compare posts in 915 - Log.debug (fun m -> m "Posts sorted by date (most recent first)"); 916 - 917 - let posts = remove ofs posts in 918 - let result = 919 - match n with 920 - | None -> 921 - Log.debug (fun m -> m "Returning all %d posts (offset=%d)" 922 - (List.length posts) ofs); 923 - posts 924 - | Some n -> 925 - let limited = take n posts in 926 - Log.debug (fun m -> m "Returning %d posts (requested=%d, offset=%d)" 927 - (List.length limited) n ofs); 928 - limited 929 - in 930 - result 931 - 932 - let of_feeds feeds = get_posts feeds 933 - 934 - let feed t = t.feed 935 - let title t = t.title 936 - let link t = t.link 937 - let date t = t.date 938 - let author t = t.author 939 - let email t = t.email 940 - let content t = Soup.to_string t.content 941 - let id t = t.id 942 - let tags t = t.tags 943 - let summary t = t.summary 944 - 945 - let meta_description _t = 946 - (* TODO: This requires environment for HTTP access *) 947 - Log.debug (fun m -> m "meta_description not implemented (requires environment)"); 948 - None 949 - 950 - let seo_image _t = 951 - (* TODO: This requires environment for HTTP access *) 952 - Log.debug (fun m -> m "seo_image not implemented (requires environment)"); 953 - None 954 - end 955 - 956 - (** {1 Format Conversion and Export} *) 957 - 958 - module Format = struct 959 - module Atom = struct 960 - let entry_of_post post = 961 - let content = Syndic.Atom.Html (None, Post.content post) in 962 - let contributors = 963 - [ Syndic.Atom.author ~uri:(Uri.of_string (Source.url (Feed.source (Post.feed post)))) 964 - (Source.name (Feed.source (Post.feed post))) ] 965 - in 966 - let links = 967 - match Post.link post with 968 - | Some l -> [ Syndic.Atom.link ~rel:Syndic.Atom.Alternate l ] 969 - | None -> [] 970 - in 971 - let id = 972 - match Post.link post with 973 - | Some l -> l 974 - | None -> Uri.of_string (Digest.to_hex (Digest.string (Post.title post))) 975 - in 976 - let authors = (Syndic.Atom.author ~email:(Post.email post) (Post.author post), []) in 977 - let title : Syndic.Atom.text_construct = Syndic.Atom.Text (Post.title post) in 978 - let updated = 979 - match Post.date post with 980 - (* Atom entry requires a date but RSS2 does not. So if a date 981 - * is not available, just capture the current date. *) 982 - | None -> Ptime.of_float_s (Unix.gettimeofday ()) |> Option.get 983 - | Some d -> d 984 - in 985 - Syndic.Atom.entry ~content ~contributors ~links ~id ~authors ~title ~updated 986 - () 987 - 988 - let entries_of_posts posts = List.map entry_of_post posts 989 - 990 - let feed_of_entries ~title ?id ?(authors = []) entries = 991 - let feed_id = match id with 992 - | Some i -> Uri.of_string i 993 - | None -> Uri.of_string "urn:river:merged" 994 - in 995 - let feed_authors = List.map (fun (name, email) -> 996 - match email with 997 - | Some e -> Syndic.Atom.author ~email:e name 998 - | None -> Syndic.Atom.author name 999 - ) authors in 1000 - { 1001 - Syndic.Atom.id = feed_id; 1002 - title = Syndic.Atom.Text title; 1003 - updated = Ptime.of_float_s (Unix.time ()) |> Option.get; 1004 - entries; 1005 - authors = feed_authors; 1006 - categories = []; 1007 - contributors = []; 1008 - generator = Some { 1009 - Syndic.Atom.version = Some "1.0"; 1010 - uri = None; 1011 - content = "River Feed Aggregator"; 1012 - }; 1013 - icon = None; 1014 - links = []; 1015 - logo = None; 1016 - rights = None; 1017 - subtitle = None; 1018 - } 1019 - 1020 - let to_string feed = 1021 - let output = Buffer.create 4096 in 1022 - Syndic.Atom.output feed (`Buffer output); 1023 - Buffer.contents output 1024 - end 1025 - 1026 - module Rss2 = struct 1027 - let of_feed feed = 1028 - match feed.Feed.content with 1029 - | Feed.Rss2 ch -> Some ch 1030 - | _ -> None 1031 - end 1032 - 1033 - module Jsonfeed = struct 1034 - let item_of_post post = 1035 - (* Convert HTML content back to string *) 1036 - let html = Post.content post in 1037 - let content = `Html html in 1038 - 1039 - (* Create author *) 1040 - let authors = 1041 - if Post.author post <> "" then 1042 - let author = Jsonfeed.Author.create ~name:(Post.author post) () in 1043 - Some [author] 1044 - else 1045 - None 1046 - in 1047 - 1048 - (* Create item *) 1049 - Jsonfeed.Item.create 1050 - ~id:(Post.id post) 1051 - ~content 1052 - ?url:(Option.map Uri.to_string (Post.link post)) 1053 - ~title:(Post.title post) 1054 - ?summary:(Post.summary post) 1055 - ?date_published:(Post.date post) 1056 - ?authors 1057 - ~tags:(Post.tags post) 1058 - () 1059 - 1060 - let items_of_posts posts = List.map item_of_post posts 1061 - 1062 - let feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items = 1063 - Jsonfeed.create ~title ?home_page_url ?feed_url ?description ?icon ?favicon ~items () 1064 - 1065 - let feed_of_posts ~title ?home_page_url ?feed_url ?description ?icon ?favicon posts = 1066 - let items = items_of_posts posts in 1067 - feed_of_items ~title ?home_page_url ?feed_url ?description ?icon ?favicon items 1068 - 1069 - let to_string ?(minify = false) jsonfeed = 1070 - match Jsonfeed.to_string ~minify jsonfeed with 1071 - | Ok s -> Ok s 1072 - | Error err -> Error (Jsont.Error.to_string err) 1073 - 1074 - let of_feed feed = 1075 - match feed.Feed.content with 1076 - | Feed.Json jf -> Some jf 1077 - | _ -> None 1078 - end 1079 - end 1080 - 1081 - (** {1 User Management} *) 1082 - 1083 - module User = struct 1084 - type t = { 1085 - username : string; 1086 - fullname : string; 1087 - email : string option; 1088 - feeds : Source.t list; 1089 - last_synced : string option; 1090 - } 1091 - 1092 - let make ~username ~fullname ?email ?(feeds = []) ?last_synced () = 1093 - { username; fullname; email; feeds; last_synced } 1094 - 1095 - let username t = t.username 1096 - let fullname t = t.fullname 1097 - let email t = t.email 1098 - let feeds t = t.feeds 1099 - let last_synced t = t.last_synced 1100 - 1101 - let add_feed t source = 1102 - { t with feeds = source :: t.feeds } 1103 - 1104 - let remove_feed t ~url = 1105 - let feeds = List.filter (fun s -> Source.url s <> url) t.feeds in 1106 - { t with feeds } 1107 - 1108 - let set_last_synced t timestamp = 1109 - { t with last_synced = Some timestamp } 1110 - 1111 - let jsont = 1112 - let make username fullname email feeds last_synced = 1113 - { username; fullname; email; feeds; last_synced } 1114 - in 1115 - Jsont.Object.map ~kind:"User" make 1116 - |> Jsont.Object.mem "username" Jsont.string ~enc:(fun u -> u.username) 1117 - |> Jsont.Object.mem "fullname" Jsont.string ~enc:(fun u -> u.fullname) 1118 - |> Jsont.Object.opt_mem "email" Jsont.string ~enc:(fun u -> u.email) 1119 - |> Jsont.Object.mem "feeds" (Jsont.list Source.jsont) ~enc:(fun u -> u.feeds) 1120 - |> Jsont.Object.opt_mem "last_synced" Jsont.string ~enc:(fun u -> u.last_synced) 1121 - |> Jsont.Object.finish 1122 - end 1123 - 1124 - (** {1 Feed Quality Analysis} *) 1125 - 1126 - module Quality = struct 1127 - type t = { 1128 - total_entries : int; 1129 - entries_with_summary : int; 1130 - entries_with_author : int; 1131 - entries_with_date : int; 1132 - entries_with_content : int; 1133 - entries_with_tags : int; 1134 - avg_content_length : float; 1135 - min_content_length : int; 1136 - max_content_length : int; 1137 - posting_frequency_days : float option; 1138 - quality_score : float; 1139 - } 1140 - 1141 - let make ~total_entries ~entries_with_summary ~entries_with_author 1142 - ~entries_with_date ~entries_with_content ~entries_with_tags 1143 - ~avg_content_length ~min_content_length ~max_content_length 1144 - ~posting_frequency_days ~quality_score = 1145 - { 1146 - total_entries; 1147 - entries_with_summary; 1148 - entries_with_author; 1149 - entries_with_date; 1150 - entries_with_content; 1151 - entries_with_tags; 1152 - avg_content_length; 1153 - min_content_length; 1154 - max_content_length; 1155 - posting_frequency_days; 1156 - quality_score; 1157 - } 1158 - 1159 - let total_entries t = t.total_entries 1160 - let entries_with_summary t = t.entries_with_summary 1161 - let entries_with_author t = t.entries_with_author 1162 - let entries_with_date t = t.entries_with_date 1163 - let entries_with_content t = t.entries_with_content 1164 - let entries_with_tags t = t.entries_with_tags 1165 - let avg_content_length t = t.avg_content_length 1166 - let min_content_length t = t.min_content_length 1167 - let max_content_length t = t.max_content_length 1168 - let posting_frequency_days t = t.posting_frequency_days 1169 - let quality_score t = t.quality_score 1170 - 1171 - (** Get content length from an Atom entry *) 1172 - let get_content_length (entry : Syndic.Atom.entry) = 1173 - match entry.content with 1174 - | Some (Syndic.Atom.Text s) -> String.length s 1175 - | Some (Syndic.Atom.Html (_, s)) -> String.length s 1176 - | Some (Syndic.Atom.Xhtml (_, _)) -> 0 (* Could calculate but complex *) 1177 - | Some (Syndic.Atom.Mime _) -> 0 1178 - | Some (Syndic.Atom.Src _) -> 0 1179 - | None -> ( 1180 - match entry.summary with 1181 - | Some (Syndic.Atom.Text s) -> String.length s 1182 - | Some (Syndic.Atom.Html (_, s)) -> String.length s 1183 - | Some (Syndic.Atom.Xhtml (_, _)) -> 0 1184 - | None -> 0) 1185 - 1186 - (** Check if entry has non-empty summary *) 1187 - let has_summary (entry : Syndic.Atom.entry) = 1188 - match entry.summary with 1189 - | Some (Syndic.Atom.Text s) when String.trim s <> "" -> true 1190 - | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> true 1191 - | Some (Syndic.Atom.Xhtml (_, _)) -> true 1192 - | _ -> false 1193 - 1194 - (** Check if entry has author *) 1195 - let has_author (entry : Syndic.Atom.entry) = 1196 - let (author, _) = entry.authors in 1197 - String.trim author.name <> "" 1198 - 1199 - (** Check if entry has content *) 1200 - let has_content (entry : Syndic.Atom.entry) = 1201 - get_content_length entry > 0 1202 - 1203 - (** Check if entry has tags/categories *) 1204 - let has_tags (entry : Syndic.Atom.entry) = 1205 - entry.categories <> [] 1206 - 1207 - (** Calculate quality score from metrics *) 1208 - let calculate_quality_score t = 1209 - let total = float_of_int t.total_entries in 1210 - if total = 0.0 then 0.0 1211 - else 1212 - let summary_pct = float_of_int t.entries_with_summary /. total *. 100.0 in 1213 - let author_pct = float_of_int t.entries_with_author /. total *. 100.0 in 1214 - let date_pct = float_of_int t.entries_with_date /. total *. 100.0 in 1215 - let content_pct = float_of_int t.entries_with_content /. total *. 100.0 in 1216 - let tags_pct = float_of_int t.entries_with_tags /. total *. 100.0 in 1217 - 1218 - (* Weighted average: content and dates are most important *) 1219 - let score = 1220 - (content_pct *. 0.30) +. 1221 - (date_pct *. 0.25) +. 1222 - (author_pct *. 0.20) +. 1223 - (summary_pct *. 0.15) +. 1224 - (tags_pct *. 0.10) 1225 - in 1226 - score 1227 - 1228 - let analyze entries = 1229 - if entries = [] then 1230 - failwith "No entries to analyze" 1231 - else 1232 - let total_entries = List.length entries in 1233 - 1234 - let entries_with_summary = ref 0 in 1235 - let entries_with_author = ref 0 in 1236 - let entries_with_date = ref total_entries in (* All Atom entries have updated *) 1237 - let entries_with_content = ref 0 in 1238 - let entries_with_tags = ref 0 in 1239 - let content_lengths = ref [] in 1240 - let dates = ref [] in 1241 - 1242 - List.iter (fun (entry : Syndic.Atom.entry) -> 1243 - if has_summary entry then incr entries_with_summary; 1244 - if has_author entry then incr entries_with_author; 1245 - if has_content entry then begin 1246 - incr entries_with_content; 1247 - content_lengths := get_content_length entry :: !content_lengths 1248 - end; 1249 - if has_tags entry then incr entries_with_tags; 1250 - dates := entry.updated :: !dates 1251 - ) entries; 1252 - 1253 - (* Calculate content statistics *) 1254 - let avg_content_length, min_content_length, max_content_length = 1255 - if !content_lengths = [] then 1256 - (0.0, 0, 0) 1257 - else 1258 - let sorted = List.sort compare !content_lengths in 1259 - let sum = List.fold_left (+) 0 sorted in 1260 - let avg = float_of_int sum /. float_of_int (List.length sorted) in 1261 - let min_len = List.hd sorted in 1262 - let max_len = List.hd (List.rev sorted) in 1263 - (avg, min_len, max_len) 1264 - in 1265 - 1266 - (* Calculate posting frequency *) 1267 - let posting_frequency_days = 1268 - if List.length !dates < 2 then 1269 - None 1270 - else 1271 - try 1272 - let timestamps = List.map Ptime.to_float_s !dates in 1273 - let sorted_timestamps = List.sort compare timestamps in 1274 - let first = List.hd sorted_timestamps in 1275 - let last = List.hd (List.rev sorted_timestamps) in 1276 - let total_days = (last -. first) /. 86400.0 in 1277 - let num_intervals = float_of_int (List.length sorted_timestamps - 1) in 1278 - Some (total_days /. num_intervals) 1279 - with _ -> None 1280 - in 1281 - 1282 - (* Create metrics record (without quality_score first) *) 1283 - let metrics = { 1284 - total_entries; 1285 - entries_with_summary = !entries_with_summary; 1286 - entries_with_author = !entries_with_author; 1287 - entries_with_date = !entries_with_date; 1288 - entries_with_content = !entries_with_content; 1289 - entries_with_tags = !entries_with_tags; 1290 - avg_content_length; 1291 - min_content_length; 1292 - max_content_length; 1293 - posting_frequency_days; 1294 - quality_score = 0.0; (* Placeholder *) 1295 - } in 1296 - 1297 - (* Calculate quality score *) 1298 - let quality_score = calculate_quality_score metrics in 1299 - { metrics with quality_score } 1300 - end 1301 - 1302 - (** {1 State Management} *) 1303 - 1304 - module State = struct 1305 - type t = { 1306 - xdg : Xdge.t; 1307 - } 1308 - 1309 - module Paths = struct 1310 - (** Get the users directory path *) 1311 - let users_dir state = Eio.Path.(Xdge.state_dir state.xdg / "users") 1312 - 1313 - (** Get the feeds directory path *) 1314 - let feeds_dir state = Eio.Path.(Xdge.state_dir state.xdg / "feeds") 1315 - 1316 - (** Get the user feeds directory path *) 1317 - let user_feeds_dir state = Eio.Path.(feeds_dir state / "user") 1318 - 1319 - (** Get the path to a user's JSON file *) 1320 - let user_file state username = 1321 - Eio.Path.(users_dir state / (username ^ ".json")) 1322 - 1323 - (** Get the path to a user's Atom feed file *) 1324 - let user_feed_file state username = 1325 - Eio.Path.(user_feeds_dir state / (username ^ ".xml")) 1326 - 1327 - (** Ensure all necessary directories exist *) 1328 - let ensure_directories state = 1329 - let dirs = [ 1330 - users_dir state; 1331 - feeds_dir state; 1332 - user_feeds_dir state; 1333 - ] in 1334 - List.iter (fun dir -> 1335 - try Eio.Path.mkdir ~perm:0o755 dir 1336 - with Eio.Io (Eio.Fs.E (Already_exists _), _) -> () 1337 - ) dirs 1338 - end 1339 - 1340 - module Json = struct 1341 - (** Decode a user from JSON string *) 1342 - let user_of_string s = 1343 - match Jsont_bytesrw.decode_string' User.jsont s with 1344 - | Ok user -> Some user 1345 - | Error err -> 1346 - Log.err (fun m -> m "Failed to parse user JSON: %s" (Jsont.Error.to_string err)); 1347 - None 1348 - 1349 - (** Encode a user to JSON string *) 1350 - let user_to_string user = 1351 - match Jsont_bytesrw.encode_string' ~format:Jsont.Indent User.jsont user with 1352 - | Ok s -> s 1353 - | Error err -> failwith ("Failed to encode user: " ^ Jsont.Error.to_string err) 1354 - end 1355 - 1356 - module Storage = struct 1357 - (** Load a user from disk *) 1358 - let load_user state username = 1359 - let file = Paths.user_file state username in 1360 - try 1361 - let content = Eio.Path.load file in 1362 - Json.user_of_string content 1363 - with 1364 - | Eio.Io (Eio.Fs.E (Not_found _), _) -> None 1365 - | e -> 1366 - Log.err (fun m -> m "Error loading user %s: %s" username (Printexc.to_string e)); 1367 - None 1368 - 1369 - (** Save a user to disk *) 1370 - let save_user state user = 1371 - let file = Paths.user_file state (User.username user) in 1372 - let json = Json.user_to_string user in 1373 - Eio.Path.save ~create:(`Or_truncate 0o644) file json 1374 - 1375 - (** List all usernames *) 1376 - let list_users state = 1377 - try 1378 - Eio.Path.read_dir (Paths.users_dir state) 1379 - |> List.filter_map (fun name -> 1380 - if Filename.check_suffix name ".json" then 1381 - Some (Filename.chop_suffix name ".json") 1382 - else None 1383 - ) 1384 - with _ -> [] 1385 - 1386 - (** Load existing Atom entries for a user *) 1387 - let load_existing_posts state username = 1388 - let file = Paths.user_feed_file state username in 1389 - try 1390 - let content = Eio.Path.load file in 1391 - (* Parse existing Atom feed *) 1392 - let input = Xmlm.make_input (`String (0, content)) in 1393 - let feed = Syndic.Atom.parse input in 1394 - feed.Syndic.Atom.entries 1395 - with 1396 - | Eio.Io (Eio.Fs.E (Not_found _), _) -> [] 1397 - | e -> 1398 - Log.err (fun m -> m "Error loading existing posts for %s: %s" 1399 - username (Printexc.to_string e)); 1400 - [] 1401 - 1402 - (** Save Atom entries for a user *) 1403 - let save_atom_feed state username entries = 1404 - let file = Paths.user_feed_file state username in 1405 - let feed = Format.Atom.feed_of_entries ~title:username entries in 1406 - let xml = Format.Atom.to_string feed in 1407 - Eio.Path.save ~create:(`Or_truncate 0o644) file xml 1408 - 1409 - (** Delete a user and their feed file *) 1410 - let delete_user state username = 1411 - let user_file = Paths.user_file state username in 1412 - let feed_file = Paths.user_feed_file state username in 1413 - (try Eio.Path.unlink user_file with _ -> ()); 1414 - (try Eio.Path.unlink feed_file with _ -> ()) 1415 - end 1416 - 1417 - module Sync = struct 1418 - (** Merge new entries with existing ones, updating matching IDs *) 1419 - let merge_entries ~existing ~new_entries = 1420 - (* Create a map of new entry IDs for efficient lookup and updates *) 1421 - let module UriMap = Map.Make(Uri) in 1422 - let new_entries_map = 1423 - List.fold_left (fun acc (entry : Syndic.Atom.entry) -> 1424 - UriMap.add entry.id entry acc 1425 - ) UriMap.empty new_entries 1426 - in 1427 - 1428 - (* Update existing entries with new ones if IDs match, otherwise keep existing *) 1429 - let updated_existing = 1430 - List.filter_map (fun (entry : Syndic.Atom.entry) -> 1431 - if UriMap.mem entry.id new_entries_map then 1432 - None (* Will be replaced by new entry *) 1433 - else 1434 - Some entry (* Keep existing entry *) 1435 - ) existing 1436 - in 20 + (** {1 Public API Modules} *) 1437 21 1438 - (* Combine new entries with non-replaced existing entries *) 1439 - let combined = new_entries @ updated_existing in 1440 - List.sort (fun (a : Syndic.Atom.entry) (b : Syndic.Atom.entry) -> 1441 - Ptime.compare b.updated a.updated 1442 - ) combined 1443 - 1444 - (** Get current timestamp in ISO 8601 format *) 1445 - let current_timestamp () = 1446 - let open Unix in 1447 - let tm = gmtime (time ()) in 1448 - Printf.sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ" 1449 - (tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday 1450 - tm.tm_hour tm.tm_min tm.tm_sec 1451 - 1452 - (** Sync feeds for a single user *) 1453 - let sync_user session state ~username = 1454 - match Storage.load_user state username with 1455 - | None -> 1456 - Error (Printf.sprintf "User %s not found" username) 1457 - | Some user when User.feeds user = [] -> 1458 - Log.info (fun m -> m "No feeds configured for user %s" username); 1459 - Ok () 1460 - | Some user -> 1461 - Log.info (fun m -> m "Syncing feeds for user %s..." username); 1462 - 1463 - (* Fetch all feeds concurrently *) 1464 - let fetched_feeds = 1465 - Eio.Fiber.List.filter_map (fun source -> 1466 - try 1467 - Log.info (fun m -> m " Fetching %s (%s)..." 1468 - (Source.name source) (Source.url source)); 1469 - Some (Feed.fetch session source) 1470 - with e -> 1471 - Log.err (fun m -> m " Failed to fetch %s: %s" 1472 - (Source.name source) (Printexc.to_string e)); 1473 - None 1474 - ) (User.feeds user) 1475 - in 1476 - 1477 - if fetched_feeds = [] then begin 1478 - Error "No feeds successfully fetched" 1479 - end else begin 1480 - (* Get posts from fetched feeds *) 1481 - let posts = Post.of_feeds fetched_feeds in 1482 - Log.info (fun m -> m " Found %d new posts" (List.length posts)); 1483 - 1484 - (* Convert to Atom entries *) 1485 - let new_entries = Format.Atom.entries_of_posts posts in 1486 - 1487 - (* Load existing entries *) 1488 - let existing = Storage.load_existing_posts state username in 1489 - Log.info (fun m -> m " Found %d existing posts" (List.length existing)); 1490 - 1491 - (* Merge entries *) 1492 - let merged = merge_entries ~existing ~new_entries in 1493 - Log.info (fun m -> m " Total posts after merge: %d" (List.length merged)); 1494 - 1495 - (* Save updated feed *) 1496 - Storage.save_atom_feed state username merged; 1497 - 1498 - (* Update last_synced timestamp *) 1499 - let now = current_timestamp () in 1500 - let user = User.set_last_synced user now in 1501 - Storage.save_user state user; 1502 - 1503 - Log.info (fun m -> m "Sync completed for user %s" username); 1504 - Ok () 1505 - end 1506 - end 1507 - 1508 - module Export = struct 1509 - (** Convert Atom entry to JSONFeed item *) 1510 - let atom_entry_to_jsonfeed_item (entry : Syndic.Atom.entry) = 1511 - (* Extract ID *) 1512 - let id = Uri.to_string entry.id in 1513 - 1514 - (* Extract title *) 1515 - let title = 1516 - match entry.title with 1517 - | Syndic.Atom.Text s -> Some s 1518 - | Syndic.Atom.Html (_, s) -> Some s 1519 - | Syndic.Atom.Xhtml (_, _) -> Some "Untitled" 1520 - in 1521 - 1522 - (* Extract URL *) 1523 - let url = 1524 - match entry.links with 1525 - | link :: _ -> Some (Uri.to_string link.href) 1526 - | [] -> None 1527 - in 1528 - 1529 - (* Extract content *) 1530 - let content = 1531 - match entry.content with 1532 - | Some (Syndic.Atom.Text s) -> `Text s 1533 - | Some (Syndic.Atom.Html (_, s)) -> `Html s 1534 - | Some (Syndic.Atom.Xhtml (_, nodes)) -> 1535 - let html = String.concat "" (List.map Syndic.XML.to_string nodes) in 1536 - `Html html 1537 - | Some (Syndic.Atom.Mime _) | Some (Syndic.Atom.Src _) | None -> 1538 - `Text "" 1539 - in 1540 - 1541 - (* Extract summary *) 1542 - let summary = 1543 - match entry.summary with 1544 - | Some (Syndic.Atom.Text s) when String.trim s <> "" -> Some s 1545 - | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> Some s 1546 - | _ -> None 1547 - in 1548 - 1549 - (* Extract authors *) 1550 - let authors = 1551 - let (author, contributors) = entry.authors in 1552 - let author_list = author :: contributors in 1553 - let jsonfeed_authors = List.filter_map (fun (a : Syndic.Atom.author) -> 1554 - let name = String.trim a.name in 1555 - if name = "" then None 1556 - else Some (Jsonfeed.Author.create ~name ()) 1557 - ) author_list in 1558 - if jsonfeed_authors = [] then None else Some jsonfeed_authors 1559 - in 1560 - 1561 - (* Extract tags *) 1562 - let tags = 1563 - match entry.categories with 1564 - | [] -> None 1565 - | cats -> 1566 - let tag_list = List.map (fun (c : Syndic.Atom.category) -> 1567 - match c.label with 1568 - | Some lbl -> lbl 1569 - | None -> c.term 1570 - ) cats in 1571 - if tag_list = [] then None else Some tag_list 1572 - in 1573 - 1574 - (* Create JSONFeed item *) 1575 - Jsonfeed.Item.create 1576 - ~id 1577 - ~content 1578 - ?title 1579 - ?url 1580 - ?summary 1581 - ?authors 1582 - ?tags 1583 - ~date_published:entry.updated 1584 - () 1585 - 1586 - (** Export entries as JSONFeed *) 1587 - let export_jsonfeed ~title entries = 1588 - let items = List.map atom_entry_to_jsonfeed_item entries in 1589 - let feed = Jsonfeed.create ~title ~items () in 1590 - match Jsonfeed.to_string ~minify:false feed with 1591 - | Ok json -> Ok json 1592 - | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err)) 1593 - end 1594 - 1595 - let create env ~app_name = 1596 - let xdg = Xdge.create env#fs app_name in 1597 - let state = { xdg } in 1598 - Paths.ensure_directories state; 1599 - state 1600 - 1601 - let create_user state user = 1602 - match Storage.load_user state (User.username user) with 1603 - | Some _ -> 1604 - Error (Printf.sprintf "User %s already exists" (User.username user)) 1605 - | None -> 1606 - Storage.save_user state user; 1607 - Log.info (fun m -> m "User %s created" (User.username user)); 1608 - Ok () 1609 - 1610 - let delete_user state ~username = 1611 - match Storage.load_user state username with 1612 - | None -> 1613 - Error (Printf.sprintf "User %s not found" username) 1614 - | Some _ -> 1615 - Storage.delete_user state username; 1616 - Log.info (fun m -> m "User %s deleted" username); 1617 - Ok () 1618 - 1619 - let get_user state ~username = 1620 - Storage.load_user state username 1621 - 1622 - let update_user state user = 1623 - match Storage.load_user state (User.username user) with 1624 - | None -> 1625 - Error (Printf.sprintf "User %s not found" (User.username user)) 1626 - | Some _ -> 1627 - Storage.save_user state user; 1628 - Log.info (fun m -> m "User %s updated" (User.username user)); 1629 - Ok () 1630 - 1631 - let list_users state = 1632 - Storage.list_users state 1633 - 1634 - let sync_user env state ~username = 1635 - Session.with_session env @@ fun session -> 1636 - Sync.sync_user session state ~username 1637 - 1638 - let sync_all env state = 1639 - let users = Storage.list_users state in 1640 - if users = [] then begin 1641 - Log.info (fun m -> m "No users to sync"); 1642 - Ok (0, 0) 1643 - end else begin 1644 - Log.info (fun m -> m "Syncing %d users concurrently..." (List.length users)); 1645 - 1646 - Session.with_session env @@ fun session -> 1647 - let results = 1648 - Eio.Fiber.List.map (fun username -> 1649 - match Sync.sync_user session state ~username with 1650 - | Ok () -> true 1651 - | Error err -> 1652 - Log.err (fun m -> m "Failed to sync user %s: %s" username err); 1653 - false 1654 - ) users 1655 - in 1656 - let success_count = List.length (List.filter (fun x -> x) results) in 1657 - let fail_count = List.length users - success_count in 1658 - 1659 - if fail_count = 0 then 1660 - Log.info (fun m -> m "All users synced successfully"); 1661 - 1662 - Ok (success_count, fail_count) 1663 - end 1664 - 1665 - let get_user_posts state ~username ?limit () = 1666 - let entries = Storage.load_existing_posts state username in 1667 - match limit with 1668 - | None -> entries 1669 - | Some n -> List.filteri (fun i _ -> i < n) entries 1670 - 1671 - let get_all_posts state ?limit () = 1672 - let users = Storage.list_users state in 1673 - 1674 - (* Collect all entries from all users with username tag *) 1675 - let all_entries = 1676 - List.concat_map (fun username -> 1677 - let entries = Storage.load_existing_posts state username in 1678 - List.map (fun entry -> (username, entry)) entries 1679 - ) users 1680 - in 1681 - 1682 - (* Sort by date (newest first) *) 1683 - let sorted = List.sort (fun (_, a : string * Syndic.Atom.entry) (_, b) -> 1684 - Ptime.compare b.updated a.updated 1685 - ) all_entries in 1686 - 1687 - match limit with 1688 - | None -> sorted 1689 - | Some n -> List.filteri (fun i _ -> i < n) sorted 1690 - 1691 - let export_merged_feed state ~title ~format ?limit () = 1692 - let all_posts = get_all_posts state ?limit () in 1693 - let entries = List.map snd all_posts in 1694 - 1695 - match format with 1696 - | `Atom -> 1697 - let xml = Format.Atom.to_string (Format.Atom.feed_of_entries ~title entries) in 1698 - Ok xml 1699 - | `Jsonfeed -> 1700 - if entries = [] then 1701 - (* Empty JSONFeed *) 1702 - let feed = Jsonfeed.create ~title ~items:[] () in 1703 - match Jsonfeed.to_string ~minify:false feed with 1704 - | Ok json -> Ok json 1705 - | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err)) 1706 - else 1707 - Export.export_jsonfeed ~title entries 1708 - 1709 - let analyze_user_quality state ~username = 1710 - match Storage.load_user state username with 1711 - | None -> 1712 - Error (Printf.sprintf "User %s not found" username) 1713 - | Some _ -> 1714 - let entries = Storage.load_existing_posts state username in 1715 - if entries = [] then 1716 - Error "No entries to analyze" 1717 - else 1718 - Ok (Quality.analyze entries) 1719 - end 22 + module Source = Source 23 + module Session = Session 24 + module Feed = Feed 25 + module Post = Post 26 + module Format = Format 27 + module User = User 28 + module Quality = Quality 29 + module State = State
+47
stack/river/lib/session.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** HTTP session management for fetching feeds. *) 19 + 20 + let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator" 21 + module Log = (val Logs.src_log src : Logs.LOG) 22 + 23 + type t = { 24 + session : (float Eio.Time.clock_ty Eio.Resource.t, 25 + [`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t; 26 + } 27 + 28 + let init ~sw env = 29 + Log.info (fun m -> m "Initializing River session"); 30 + let session = Requests.create ~sw 31 + ~default_headers:(Requests.Headers.of_list [ 32 + ("User-Agent", "OCaml-River/1.0"); 33 + ]) 34 + ~follow_redirects:true 35 + ~max_redirects:5 36 + ~verify_tls:true 37 + env 38 + in 39 + { session } 40 + 41 + let with_session env f = 42 + Log.info (fun m -> m "Creating River session"); 43 + Eio.Switch.run @@ fun sw -> 44 + let client = init ~sw env in 45 + f client 46 + 47 + let get_requests_session t = t.session
+59
stack/river/lib/session.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** HTTP session management for fetching feeds. *) 19 + 20 + type t 21 + (** An abstract HTTP session for fetching feeds. 22 + 23 + The session manages HTTP connections and is tied to an Eio switch 24 + for proper resource cleanup. *) 25 + 26 + val init : 27 + sw:Eio.Switch.t -> 28 + < clock : float Eio.Time.clock_ty Eio.Resource.t; 29 + fs : Eio.Fs.dir_ty Eio.Path.t; 30 + net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > -> 31 + t 32 + (** [init ~sw env] creates a new HTTP session. 33 + 34 + The session is configured with appropriate defaults for fetching feeds: 35 + - User-Agent: "OCaml-River/1.0" 36 + - Automatic redirect following (max 5 redirects) 37 + - TLS verification enabled 38 + 39 + @param sw The switch for resource management 40 + @param env The Eio environment *) 41 + 42 + val with_session : 43 + < clock : float Eio.Time.clock_ty Eio.Resource.t; 44 + fs : Eio.Fs.dir_ty Eio.Path.t; 45 + net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > -> 46 + (t -> 'a) -> 'a 47 + (** [with_session env f] creates a session and automatically manages its lifecycle. 48 + 49 + This is the recommended way to use River as it ensures proper cleanup. 50 + 51 + @param env The Eio environment 52 + @param f The function to run with the session *) 53 + 54 + val get_requests_session : t -> 55 + (float Eio.Time.clock_ty Eio.Resource.t, 56 + [`Generic | `Unix] Eio.Net.ty Eio.Resource.t) Requests.t 57 + (** [get_requests_session t] returns the underlying Requests session. 58 + 59 + This is used internally by the Feed module. *)
+35
stack/river/lib/source.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed source with name and URL. *) 19 + 20 + type t = { 21 + name : string; 22 + url : string; 23 + } 24 + 25 + let make ~name ~url = { name; url } 26 + 27 + let name t = t.name 28 + let url t = t.url 29 + 30 + let jsont = 31 + let make name url = { name; url } in 32 + Jsont.Object.map ~kind:"Source" make 33 + |> Jsont.Object.mem "name" Jsont.string ~enc:(fun s -> s.name) 34 + |> Jsont.Object.mem "url" Jsont.string ~enc:(fun s -> s.url) 35 + |> Jsont.Object.finish
+33
stack/river/lib/source.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Feed source with name and URL. *) 19 + 20 + type t 21 + (** A feed source with name and URL. *) 22 + 23 + val make : name:string -> url:string -> t 24 + (** [make ~name ~url] creates a new feed source. *) 25 + 26 + val name : t -> string 27 + (** [name source] returns the feed name/label. *) 28 + 29 + val url : t -> string 30 + (** [url source] returns the feed URL. *) 31 + 32 + val jsont : t Jsont.t 33 + (** JSON codec for sources. *)
+436
stack/river/lib/state.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** State management for user data and feeds. *) 19 + 20 + let src = Logs.Src.create "river" ~doc:"River RSS/Atom aggregator" 21 + module Log = (val Logs.src_log src : Logs.LOG) 22 + 23 + type t = { 24 + xdg : Xdge.t; 25 + } 26 + 27 + module Paths = struct 28 + (** Get the users directory path *) 29 + let users_dir state = Eio.Path.(Xdge.state_dir state.xdg / "users") 30 + 31 + (** Get the feeds directory path *) 32 + let feeds_dir state = Eio.Path.(Xdge.state_dir state.xdg / "feeds") 33 + 34 + (** Get the user feeds directory path *) 35 + let user_feeds_dir state = Eio.Path.(feeds_dir state / "user") 36 + 37 + (** Get the path to a user's JSON file *) 38 + let user_file state username = 39 + Eio.Path.(users_dir state / (username ^ ".json")) 40 + 41 + (** Get the path to a user's Atom feed file *) 42 + let user_feed_file state username = 43 + Eio.Path.(user_feeds_dir state / (username ^ ".xml")) 44 + 45 + (** Ensure all necessary directories exist *) 46 + let ensure_directories state = 47 + let dirs = [ 48 + users_dir state; 49 + feeds_dir state; 50 + user_feeds_dir state; 51 + ] in 52 + List.iter (fun dir -> 53 + try Eio.Path.mkdir ~perm:0o755 dir 54 + with Eio.Io (Eio.Fs.E (Already_exists _), _) -> () 55 + ) dirs 56 + end 57 + 58 + module Json = struct 59 + (** Decode a user from JSON string *) 60 + let user_of_string s = 61 + match Jsont_bytesrw.decode_string' User.jsont s with 62 + | Ok user -> Some user 63 + | Error err -> 64 + Log.err (fun m -> m "Failed to parse user JSON: %s" (Jsont.Error.to_string err)); 65 + None 66 + 67 + (** Encode a user to JSON string *) 68 + let user_to_string user = 69 + match Jsont_bytesrw.encode_string' ~format:Jsont.Indent User.jsont user with 70 + | Ok s -> s 71 + | Error err -> failwith ("Failed to encode user: " ^ Jsont.Error.to_string err) 72 + end 73 + 74 + module Storage = struct 75 + (** Load a user from disk *) 76 + let load_user state username = 77 + let file = Paths.user_file state username in 78 + try 79 + let content = Eio.Path.load file in 80 + Json.user_of_string content 81 + with 82 + | Eio.Io (Eio.Fs.E (Not_found _), _) -> None 83 + | e -> 84 + Log.err (fun m -> m "Error loading user %s: %s" username (Printexc.to_string e)); 85 + None 86 + 87 + (** Save a user to disk *) 88 + let save_user state user = 89 + let file = Paths.user_file state (User.username user) in 90 + let json = Json.user_to_string user in 91 + Eio.Path.save ~create:(`Or_truncate 0o644) file json 92 + 93 + (** List all usernames *) 94 + let list_users state = 95 + try 96 + Eio.Path.read_dir (Paths.users_dir state) 97 + |> List.filter_map (fun name -> 98 + if Filename.check_suffix name ".json" then 99 + Some (Filename.chop_suffix name ".json") 100 + else None 101 + ) 102 + with _ -> [] 103 + 104 + (** Load existing Atom entries for a user *) 105 + let load_existing_posts state username = 106 + let file = Paths.user_feed_file state username in 107 + try 108 + let content = Eio.Path.load file in 109 + (* Parse existing Atom feed *) 110 + let input = Xmlm.make_input (`String (0, content)) in 111 + let feed = Syndic.Atom.parse input in 112 + feed.Syndic.Atom.entries 113 + with 114 + | Eio.Io (Eio.Fs.E (Not_found _), _) -> [] 115 + | e -> 116 + Log.err (fun m -> m "Error loading existing posts for %s: %s" 117 + username (Printexc.to_string e)); 118 + [] 119 + 120 + (** Save Atom entries for a user *) 121 + let save_atom_feed state username entries = 122 + let file = Paths.user_feed_file state username in 123 + let feed = Format.Atom.feed_of_entries ~title:username entries in 124 + let xml = Format.Atom.to_string feed in 125 + Eio.Path.save ~create:(`Or_truncate 0o644) file xml 126 + 127 + (** Delete a user and their feed file *) 128 + let delete_user state username = 129 + let user_file = Paths.user_file state username in 130 + let feed_file = Paths.user_feed_file state username in 131 + (try Eio.Path.unlink user_file with _ -> ()); 132 + (try Eio.Path.unlink feed_file with _ -> ()) 133 + end 134 + 135 + module Sync = struct 136 + (** Merge new entries with existing ones, updating matching IDs *) 137 + let merge_entries ~existing ~new_entries = 138 + (* Create a map of new entry IDs for efficient lookup and updates *) 139 + let module UriMap = Map.Make(Uri) in 140 + let new_entries_map = 141 + List.fold_left (fun acc (entry : Syndic.Atom.entry) -> 142 + UriMap.add entry.id entry acc 143 + ) UriMap.empty new_entries 144 + in 145 + 146 + (* Update existing entries with new ones if IDs match, otherwise keep existing *) 147 + let updated_existing = 148 + List.filter_map (fun (entry : Syndic.Atom.entry) -> 149 + if UriMap.mem entry.id new_entries_map then 150 + None (* Will be replaced by new entry *) 151 + else 152 + Some entry (* Keep existing entry *) 153 + ) existing 154 + in 155 + 156 + (* Combine new entries with non-replaced existing entries *) 157 + let combined = new_entries @ updated_existing in 158 + List.sort (fun (a : Syndic.Atom.entry) (b : Syndic.Atom.entry) -> 159 + Ptime.compare b.updated a.updated 160 + ) combined 161 + 162 + (** Get current timestamp in ISO 8601 format *) 163 + let current_timestamp () = 164 + let open Unix in 165 + let tm = gmtime (time ()) in 166 + Printf.sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ" 167 + (tm.tm_year + 1900) (tm.tm_mon + 1) tm.tm_mday 168 + tm.tm_hour tm.tm_min tm.tm_sec 169 + 170 + (** Sync feeds for a single user *) 171 + let sync_user session state ~username = 172 + match Storage.load_user state username with 173 + | None -> 174 + Error (Printf.sprintf "User %s not found" username) 175 + | Some user when User.feeds user = [] -> 176 + Log.info (fun m -> m "No feeds configured for user %s" username); 177 + Ok () 178 + | Some user -> 179 + Log.info (fun m -> m "Syncing feeds for user %s..." username); 180 + 181 + (* Fetch all feeds concurrently *) 182 + let fetched_feeds = 183 + Eio.Fiber.List.filter_map (fun source -> 184 + try 185 + Log.info (fun m -> m " Fetching %s (%s)..." 186 + (Source.name source) (Source.url source)); 187 + Some (Feed.fetch session source) 188 + with e -> 189 + Log.err (fun m -> m " Failed to fetch %s: %s" 190 + (Source.name source) (Printexc.to_string e)); 191 + None 192 + ) (User.feeds user) 193 + in 194 + 195 + if fetched_feeds = [] then begin 196 + Error "No feeds successfully fetched" 197 + end else begin 198 + (* Get posts from fetched feeds *) 199 + let posts = Post.of_feeds fetched_feeds in 200 + Log.info (fun m -> m " Found %d new posts" (List.length posts)); 201 + 202 + (* Convert to Atom entries *) 203 + let new_entries = Format.Atom.entries_of_posts posts in 204 + 205 + (* Load existing entries *) 206 + let existing = Storage.load_existing_posts state username in 207 + Log.info (fun m -> m " Found %d existing posts" (List.length existing)); 208 + 209 + (* Merge entries *) 210 + let merged = merge_entries ~existing ~new_entries in 211 + Log.info (fun m -> m " Total posts after merge: %d" (List.length merged)); 212 + 213 + (* Save updated feed *) 214 + Storage.save_atom_feed state username merged; 215 + 216 + (* Update last_synced timestamp *) 217 + let now = current_timestamp () in 218 + let user = User.set_last_synced user now in 219 + Storage.save_user state user; 220 + 221 + Log.info (fun m -> m "Sync completed for user %s" username); 222 + Ok () 223 + end 224 + end 225 + 226 + module Export = struct 227 + (** Convert Atom entry to JSONFeed item *) 228 + let atom_entry_to_jsonfeed_item (entry : Syndic.Atom.entry) = 229 + (* Extract ID *) 230 + let id = Uri.to_string entry.id in 231 + 232 + (* Extract title *) 233 + let title = 234 + match entry.title with 235 + | Syndic.Atom.Text s -> Some s 236 + | Syndic.Atom.Html (_, s) -> Some s 237 + | Syndic.Atom.Xhtml (_, _) -> Some "Untitled" 238 + in 239 + 240 + (* Extract URL *) 241 + let url = 242 + match entry.links with 243 + | link :: _ -> Some (Uri.to_string link.href) 244 + | [] -> None 245 + in 246 + 247 + (* Extract content *) 248 + let content = 249 + match entry.content with 250 + | Some (Syndic.Atom.Text s) -> `Text s 251 + | Some (Syndic.Atom.Html (_, s)) -> `Html s 252 + | Some (Syndic.Atom.Xhtml (_, nodes)) -> 253 + let html = String.concat "" (List.map Syndic.XML.to_string nodes) in 254 + `Html html 255 + | Some (Syndic.Atom.Mime _) | Some (Syndic.Atom.Src _) | None -> 256 + `Text "" 257 + in 258 + 259 + (* Extract summary *) 260 + let summary = 261 + match entry.summary with 262 + | Some (Syndic.Atom.Text s) when String.trim s <> "" -> Some s 263 + | Some (Syndic.Atom.Html (_, s)) when String.trim s <> "" -> Some s 264 + | _ -> None 265 + in 266 + 267 + (* Extract authors *) 268 + let authors = 269 + let (author, contributors) = entry.authors in 270 + let author_list = author :: contributors in 271 + let jsonfeed_authors = List.filter_map (fun (a : Syndic.Atom.author) -> 272 + let name = String.trim a.name in 273 + if name = "" then None 274 + else Some (Jsonfeed.Author.create ~name ()) 275 + ) author_list in 276 + if jsonfeed_authors = [] then None else Some jsonfeed_authors 277 + in 278 + 279 + (* Extract tags *) 280 + let tags = 281 + match entry.categories with 282 + | [] -> None 283 + | cats -> 284 + let tag_list = List.map (fun (c : Syndic.Atom.category) -> 285 + match c.label with 286 + | Some lbl -> lbl 287 + | None -> c.term 288 + ) cats in 289 + if tag_list = [] then None else Some tag_list 290 + in 291 + 292 + (* Create JSONFeed item *) 293 + Jsonfeed.Item.create 294 + ~id 295 + ~content 296 + ?title 297 + ?url 298 + ?summary 299 + ?authors 300 + ?tags 301 + ~date_published:entry.updated 302 + () 303 + 304 + (** Export entries as JSONFeed *) 305 + let export_jsonfeed ~title entries = 306 + let items = List.map atom_entry_to_jsonfeed_item entries in 307 + let feed = Jsonfeed.create ~title ~items () in 308 + match Jsonfeed.to_string ~minify:false feed with 309 + | Ok json -> Ok json 310 + | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err)) 311 + end 312 + 313 + let create env ~app_name = 314 + let xdg = Xdge.create env#fs app_name in 315 + let state = { xdg } in 316 + Paths.ensure_directories state; 317 + state 318 + 319 + let create_user state user = 320 + match Storage.load_user state (User.username user) with 321 + | Some _ -> 322 + Error (Printf.sprintf "User %s already exists" (User.username user)) 323 + | None -> 324 + Storage.save_user state user; 325 + Log.info (fun m -> m "User %s created" (User.username user)); 326 + Ok () 327 + 328 + let delete_user state ~username = 329 + match Storage.load_user state username with 330 + | None -> 331 + Error (Printf.sprintf "User %s not found" username) 332 + | Some _ -> 333 + Storage.delete_user state username; 334 + Log.info (fun m -> m "User %s deleted" username); 335 + Ok () 336 + 337 + let get_user state ~username = 338 + Storage.load_user state username 339 + 340 + let update_user state user = 341 + match Storage.load_user state (User.username user) with 342 + | None -> 343 + Error (Printf.sprintf "User %s not found" (User.username user)) 344 + | Some _ -> 345 + Storage.save_user state user; 346 + Log.info (fun m -> m "User %s updated" (User.username user)); 347 + Ok () 348 + 349 + let list_users state = 350 + Storage.list_users state 351 + 352 + let sync_user env state ~username = 353 + Session.with_session env @@ fun session -> 354 + Sync.sync_user session state ~username 355 + 356 + let sync_all env state = 357 + let users = Storage.list_users state in 358 + if users = [] then begin 359 + Log.info (fun m -> m "No users to sync"); 360 + Ok (0, 0) 361 + end else begin 362 + Log.info (fun m -> m "Syncing %d users concurrently..." (List.length users)); 363 + 364 + Session.with_session env @@ fun session -> 365 + let results = 366 + Eio.Fiber.List.map (fun username -> 367 + match Sync.sync_user session state ~username with 368 + | Ok () -> true 369 + | Error err -> 370 + Log.err (fun m -> m "Failed to sync user %s: %s" username err); 371 + false 372 + ) users 373 + in 374 + let success_count = List.length (List.filter (fun x -> x) results) in 375 + let fail_count = List.length users - success_count in 376 + 377 + if fail_count = 0 then 378 + Log.info (fun m -> m "All users synced successfully"); 379 + 380 + Ok (success_count, fail_count) 381 + end 382 + 383 + let get_user_posts state ~username ?limit () = 384 + let entries = Storage.load_existing_posts state username in 385 + match limit with 386 + | None -> entries 387 + | Some n -> List.filteri (fun i _ -> i < n) entries 388 + 389 + let get_all_posts state ?limit () = 390 + let users = Storage.list_users state in 391 + 392 + (* Collect all entries from all users with username tag *) 393 + let all_entries = 394 + List.concat_map (fun username -> 395 + let entries = Storage.load_existing_posts state username in 396 + List.map (fun entry -> (username, entry)) entries 397 + ) users 398 + in 399 + 400 + (* Sort by date (newest first) *) 401 + let sorted = List.sort (fun (_, a : string * Syndic.Atom.entry) (_, b) -> 402 + Ptime.compare b.updated a.updated 403 + ) all_entries in 404 + 405 + match limit with 406 + | None -> sorted 407 + | Some n -> List.filteri (fun i _ -> i < n) sorted 408 + 409 + let export_merged_feed state ~title ~format ?limit () = 410 + let all_posts = get_all_posts state ?limit () in 411 + let entries = List.map snd all_posts in 412 + 413 + match format with 414 + | `Atom -> 415 + let xml = Format.Atom.to_string (Format.Atom.feed_of_entries ~title entries) in 416 + Ok xml 417 + | `Jsonfeed -> 418 + if entries = [] then 419 + (* Empty JSONFeed *) 420 + let feed = Jsonfeed.create ~title ~items:[] () in 421 + match Jsonfeed.to_string ~minify:false feed with 422 + | Ok json -> Ok json 423 + | Error err -> Error (Printf.sprintf "Failed to serialize JSON Feed: %s" (Jsont.Error.to_string err)) 424 + else 425 + Export.export_jsonfeed ~title entries 426 + 427 + let analyze_user_quality state ~username = 428 + match Storage.load_user state username with 429 + | None -> 430 + Error (Printf.sprintf "User %s not found" username) 431 + | Some _ -> 432 + let entries = Storage.load_existing_posts state username in 433 + if entries = [] then 434 + Error "No entries to analyze" 435 + else 436 + Ok (Quality.analyze entries)
+120
stack/river/lib/state.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** State management for user data and feeds. *) 19 + 20 + type t 21 + (** State handle for managing user data and feeds on disk. *) 22 + 23 + val create : 24 + < fs : Eio.Fs.dir_ty Eio.Path.t; .. > -> 25 + app_name:string -> 26 + t 27 + (** [create env ~app_name] creates a state handle using XDG directories. 28 + 29 + Data is stored in: 30 + - Users: $XDG_STATE_HOME/[app_name]/users/ 31 + - Feeds: $XDG_STATE_HOME/[app_name]/feeds/user/ 32 + 33 + @param env The Eio environment with filesystem access 34 + @param app_name Application name for XDG paths *) 35 + 36 + (** {2 User Operations} *) 37 + 38 + val create_user : t -> User.t -> (unit, string) result 39 + (** [create_user state user] creates a new user. 40 + 41 + Returns [Error] if the user already exists. *) 42 + 43 + val delete_user : t -> username:string -> (unit, string) result 44 + (** [delete_user state ~username] deletes a user and their feed data. *) 45 + 46 + val get_user : t -> username:string -> User.t option 47 + (** [get_user state ~username] retrieves a user by username. *) 48 + 49 + val update_user : t -> User.t -> (unit, string) result 50 + (** [update_user state user] saves updated user configuration. *) 51 + 52 + val list_users : t -> string list 53 + (** [list_users state] returns all usernames. *) 54 + 55 + (** {2 Feed Operations} *) 56 + 57 + val sync_user : 58 + < clock : float Eio.Time.clock_ty Eio.Resource.t; 59 + fs : Eio.Fs.dir_ty Eio.Path.t; 60 + net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > -> 61 + t -> 62 + username:string -> 63 + (unit, string) result 64 + (** [sync_user env state ~username] fetches all feeds for the user and stores merged result. 65 + 66 + Posts are fetched concurrently and merged with existing posts. 67 + The result is stored as an Atom feed. *) 68 + 69 + val sync_all : 70 + < clock : float Eio.Time.clock_ty Eio.Resource.t; 71 + fs : Eio.Fs.dir_ty Eio.Path.t; 72 + net : [ `Generic | `Unix ] Eio.Net.ty Eio.Resource.t; .. > -> 73 + t -> 74 + (int * int, string) result 75 + (** [sync_all env state] syncs all users concurrently. 76 + 77 + Returns [Ok (success_count, fail_count)]. *) 78 + 79 + val get_user_posts : 80 + t -> 81 + username:string -> 82 + ?limit:int -> 83 + unit -> 84 + Syndic.Atom.entry list 85 + (** [get_user_posts state ~username ()] retrieves stored posts for a user. 86 + 87 + @param limit Optional maximum number of posts to return *) 88 + 89 + val get_all_posts : 90 + t -> 91 + ?limit:int -> 92 + unit -> 93 + (string * Syndic.Atom.entry) list 94 + (** [get_all_posts state ()] retrieves posts from all users, sorted by date. 95 + 96 + Returns list of (username, entry) tuples. 97 + @param limit Optional maximum number of posts to return *) 98 + 99 + (** {2 Export} *) 100 + 101 + val export_merged_feed : 102 + t -> 103 + title:string -> 104 + format:[ `Atom | `Jsonfeed ] -> 105 + ?limit:int -> 106 + unit -> 107 + (string, string) result 108 + (** [export_merged_feed state ~title ~format ()] exports a merged feed of all users. 109 + 110 + @param title Feed title 111 + @param format Output format 112 + @param limit Optional maximum number of entries *) 113 + 114 + (** {2 Analysis} *) 115 + 116 + val analyze_user_quality : 117 + t -> 118 + username:string -> 119 + (Quality.t, string) result 120 + (** [analyze_user_quality state ~username] analyzes quality metrics for a user's feed. *)
+34
stack/river/lib/text_extract.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** Internal utility for Syndic text extraction *) 19 + 20 + open Syndic 21 + 22 + (* Remove all tags *) 23 + let rec syndic_to_buffer b = function 24 + | XML.Node (_, _, subs) -> List.iter (syndic_to_buffer b) subs 25 + | XML.Data (_, d) -> Buffer.add_string b d 26 + 27 + let syndic_to_string x = 28 + let b = Buffer.create 1024 in 29 + List.iter (syndic_to_buffer b) x; 30 + Buffer.contents b 31 + 32 + let string_of_text_construct : Atom.text_construct -> string = function 33 + | Atom.Text s | Atom.Html (_, s) -> s 34 + | Atom.Xhtml (_, x) -> syndic_to_string x
+57
stack/river/lib/user.ml
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** User management. *) 19 + 20 + type t = { 21 + username : string; 22 + fullname : string; 23 + email : string option; 24 + feeds : Source.t list; 25 + last_synced : string option; 26 + } 27 + 28 + let make ~username ~fullname ?email ?(feeds = []) ?last_synced () = 29 + { username; fullname; email; feeds; last_synced } 30 + 31 + let username t = t.username 32 + let fullname t = t.fullname 33 + let email t = t.email 34 + let feeds t = t.feeds 35 + let last_synced t = t.last_synced 36 + 37 + let add_feed t source = 38 + { t with feeds = source :: t.feeds } 39 + 40 + let remove_feed t ~url = 41 + let feeds = List.filter (fun s -> Source.url s <> url) t.feeds in 42 + { t with feeds } 43 + 44 + let set_last_synced t timestamp = 45 + { t with last_synced = Some timestamp } 46 + 47 + let jsont = 48 + let make username fullname email feeds last_synced = 49 + { username; fullname; email; feeds; last_synced } 50 + in 51 + Jsont.Object.map ~kind:"User" make 52 + |> Jsont.Object.mem "username" Jsont.string ~enc:(fun u -> u.username) 53 + |> Jsont.Object.mem "fullname" Jsont.string ~enc:(fun u -> u.fullname) 54 + |> Jsont.Object.opt_mem "email" Jsont.string ~enc:(fun u -> u.email) 55 + |> Jsont.Object.mem "feeds" (Jsont.list Source.jsont) ~enc:(fun u -> u.feeds) 56 + |> Jsont.Object.opt_mem "last_synced" Jsont.string ~enc:(fun u -> u.last_synced) 57 + |> Jsont.Object.finish
+64
stack/river/lib/user.mli
··· 1 + (* 2 + * Copyright (c) 2014, OCaml.org project 3 + * Copyright (c) 2015 KC Sivaramakrishnan <sk826@cl.cam.ac.uk> 4 + * 5 + * Permission to use, copy, modify, and distribute this software for any 6 + * purpose with or without fee is hereby granted, provided that the above 7 + * copyright notice and this permission notice appear in all copies. 8 + * 9 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 + *) 17 + 18 + (** User management. *) 19 + 20 + type t 21 + (** User configuration with feed subscriptions. *) 22 + 23 + val make : 24 + username:string -> 25 + fullname:string -> 26 + ?email:string -> 27 + ?feeds:Source.t list -> 28 + ?last_synced:string -> 29 + unit -> 30 + t 31 + (** [make ~username ~fullname ()] creates a new user. 32 + 33 + @param username Unique username identifier 34 + @param fullname User's display name 35 + @param email Optional email address 36 + @param feeds Optional list of feed sources (default: []) 37 + @param last_synced Optional ISO 8601 timestamp of last sync *) 38 + 39 + val username : t -> string 40 + (** [username user] returns the username. *) 41 + 42 + val fullname : t -> string 43 + (** [fullname user] returns the full name. *) 44 + 45 + val email : t -> string option 46 + (** [email user] returns the email address if set. *) 47 + 48 + val feeds : t -> Source.t list 49 + (** [feeds user] returns the list of subscribed feeds. *) 50 + 51 + val last_synced : t -> string option 52 + (** [last_synced user] returns the last sync timestamp if set. *) 53 + 54 + val add_feed : t -> Source.t -> t 55 + (** [add_feed user source] returns a new user with the feed added. *) 56 + 57 + val remove_feed : t -> url:string -> t 58 + (** [remove_feed user ~url] returns a new user with the feed removed by URL. *) 59 + 60 + val set_last_synced : t -> string -> t 61 + (** [set_last_synced user timestamp] returns a new user with updated sync time. *) 62 + 63 + val jsont : t Jsont.t 64 + (** JSON codec for users. *)