//! Embedded site-specific extraction rules
//!
//! Rules are compiled into the binary at build time for fast access without filesystem dependencies.

use std::collections::HashMap;

/// Embedded rule files indexed by domain
///
/// Supported domains:
/// - arxiv.org
/// - .wikipedia.org (subdomain wildcard)
pub fn get_embedded_rules() -> HashMap<&'static str, &'static str> {
    let mut rules = HashMap::new();
    rules.insert("arxiv.org", include_str!("../../rules/arxiv.org.txt"));
    rules.insert(".wikipedia.org", include_str!("../../rules/.wikipedia.org.txt"));
    rules
}

/// Get embedded rule content for a domain
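///
/// Exact domain matches are tried first, then a `.parent-domain` wildcard key.
/// A minimal usage sketch (relying on the embedded rule files registered above):
///
/// ```ignore
/// assert!(get_rule_for_domain("arxiv.org").is_some());          // exact match
/// assert!(get_rule_for_domain("en.wikipedia.org").is_some());   // ".wikipedia.org" wildcard
/// assert!(get_rule_for_domain("unknown.example").is_none());
/// ```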
pub fn get_rule_for_domain(domain: &str) -> Option<&'static str> {
    let rules = get_embedded_rules();

    if let Some(rule) = rules.get(domain) {
        return Some(rule);
    }

    let parts: Vec<&str> = domain.split('.').collect();
    if parts.len() > 2 {
        let parent_domain = parts[1..].join(".");
        let wildcard_key = format!(".{}", parent_domain);
        if let Some(rule) = rules.get(wildcard_key.as_str()) {
            return Some(rule);
        }
    }

    None
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_embedded_rules_loaded() {
        let rules = get_embedded_rules();
        assert!(rules.contains_key("arxiv.org"));
        assert!(rules.contains_key(".wikipedia.org"));
    }

    #[test]
    fn test_get_arxiv_rule() {
        let rule = get_rule_for_domain("arxiv.org");
        assert!(rule.is_some());
        assert!(rule.unwrap().contains("title:"));
    }

    #[test]
    fn test_get_wikipedia_rule_subdomain() {
        let rule = get_rule_for_domain("en.wikipedia.org");
        assert!(rule.is_some());
        assert!(rule.unwrap().contains("firstHeading"));
    }

    #[test]
    fn test_unknown_domain() {
        let rule = get_rule_for_domain("unknown.com");
        assert!(rule.is_none());
    }
}
+132
crates/readability/src/config/loader.rs
//! Load site-specific configuration files based on URL

use crate::config::embedded_rules;
use crate::config::parser::{SiteConfig, parse_config};
use crate::error::Result;
use std::path::{Path, PathBuf};
use url::Url;

/// Loads site-specific configuration files
///
/// Checks the external rules_dir first (if provided), then falls back to embedded rules.
#[derive(Default)]
pub struct ConfigLoader {
    rules_dir: Option<PathBuf>,
}

impl ConfigLoader {
    /// Create a new config loader with embedded rules only
    pub fn new() -> Self {
        Self::default()
    }

    /// Create a config loader with an external rules directory
    ///
    /// External rules take precedence over embedded rules.
    pub fn with_rules_dir(rules_dir: PathBuf) -> Self {
        Self { rules_dir: Some(rules_dir) }
    }

    /// Load configuration for a given URL
    ///
    /// Priority:
    /// 1. External rules (if rules_dir provided)
    /// 2. Embedded rules
    /// 3. None (if no match found)
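    ///
    /// A minimal usage sketch (the URL is illustrative; it resolves against the embedded arxiv rules):
    ///
    /// ```ignore
    /// let loader = ConfigLoader::new();
    /// let config = loader.load_for_url("https://arxiv.org/abs/2009.03017")?;
    /// assert!(config.is_some());
    /// ```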
    pub fn load_for_url(&self, url: &str) -> Result<Option<SiteConfig>> {
        let Some(domain) = Self::extract_domain(url) else {
            return Ok(None);
        };

        if let Some(ref rules_dir) = self.rules_dir
            && let Some(config) = self.try_load_from_dir(rules_dir, &domain)?
        {
            return Ok(Some(config));
        }

        if let Some(rule_content) = embedded_rules::get_rule_for_domain(&domain) {
            return Ok(Some(parse_config(rule_content)?));
        }

        Ok(None)
    }

    /// Try to load config from external directory
    fn try_load_from_dir(&self, rules_dir: &Path, domain: &str) -> Result<Option<SiteConfig>> {
        let exact_path = rules_dir.join(format!("{}.txt", domain));
        if exact_path.exists() {
            let content = std::fs::read_to_string(&exact_path)?;
            return Ok(Some(parse_config(&content)?));
        }

        let wildcard_path = rules_dir.join(format!(".{}.txt", domain));
        if wildcard_path.exists() {
            let content = std::fs::read_to_string(&wildcard_path)?;
            return Ok(Some(parse_config(&content)?));
        }

        if let Some(parent_domain) = Self::extract_parent_domain(domain) {
            let parent_wildcard = rules_dir.join(format!(".{}.txt", parent_domain));
            if parent_wildcard.exists() {
                let content = std::fs::read_to_string(&parent_wildcard)?;
                return Ok(Some(parse_config(&content)?));
            }
        }

        Ok(None)
    }

    /// Extract domain from URL
    fn extract_domain(url: &str) -> Option<String> {
        Url::parse(url).ok().and_then(|u| u.host_str().map(String::from))
    }

    /// Extract parent domain (e.g., "en.wikipedia.org" -> "wikipedia.org")
    fn extract_parent_domain(domain: &str) -> Option<String> {
        let parts: Vec<&str> = domain.split('.').collect();
        if parts.len() > 2 { Some(parts[1..].join(".")) } else { None }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_domain() {
        assert_eq!(
            ConfigLoader::extract_domain("https://arxiv.org/abs/123"),
            Some("arxiv.org".to_string())
        );
        assert_eq!(
            ConfigLoader::extract_domain("https://en.wikipedia.org/wiki/Article"),
            Some("en.wikipedia.org".to_string())
        );
        assert_eq!(ConfigLoader::extract_domain("invalid"), None);
    }

    #[test]
    fn test_load_embedded_arxiv() {
        let loader = ConfigLoader::new();
        let config = loader
            .load_for_url("https://arxiv.org/abs/2009.03017")
            .unwrap()
            .expect("Should find embedded arxiv config");

        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
    }

    #[test]
    fn test_load_embedded_wikipedia() {
        let loader = ConfigLoader::new();
        let config = loader
            .load_for_url("https://en.wikipedia.org/wiki/Article")
            .unwrap()
            .expect("Should find embedded wikipedia config");

        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
        assert!(!config.prune);
    }
}
+8
crates/readability/src/config/mod.rs
//! Configuration file parsing and loading for site-specific extraction rules

pub mod embedded_rules;
pub mod loader;
pub mod parser;

pub use loader::ConfigLoader;
pub use parser::{SiteConfig, parse_config};
+156
crates/readability/src/config/parser.rs
//! Parser for ftr-site-config format extraction rules

use crate::error::{Error, Result};

/// Site-specific extraction configuration
#[derive(Debug, Clone, Default)]
pub struct SiteConfig {
    /// XPath expressions for title extraction (evaluated in order)
    pub title: Vec<String>,
    /// XPath expressions for body extraction
    pub body: Vec<String>,
    /// XPath expressions for author extraction
    pub author: Vec<String>,
    /// XPath expressions for date extraction
    pub date: Vec<String>,
    /// XPath expressions for elements to strip
    pub strip: Vec<String>,
    /// Substrings to match in @id or @class for stripping
    pub strip_id_or_class: Vec<String>,
    /// Whether to prune non-content elements (default: true)
    pub prune: bool,
    /// Whether to run HTML Tidy preprocessor (default: true)
    pub tidy: bool,
    /// Whether to fall back to generic extraction on failure (default: true)
    pub autodetect_on_failure: bool,
    /// Test URLs for validation
    pub test_urls: Vec<String>,
}

/// Parse a site configuration file in ftr-site-config format
///
/// Format:
/// ```text
/// # Comments start with hash
/// directive: value
/// directive: another value
///
/// # Boolean directives
/// prune: yes
/// tidy: no
/// ```
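///
/// A minimal sketch of calling the parser (repeated directives accumulate in order):
///
/// ```ignore
/// let config = parse_config("title: //h1\nbody: //article\nprune: no")?;
/// assert_eq!(config.title, vec!["//h1".to_string()]);
/// assert!(!config.prune);
/// ```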
pub fn parse_config(content: &str) -> Result<SiteConfig> {
    let mut config = SiteConfig { prune: true, tidy: true, autodetect_on_failure: true, ..Default::default() };

    for line in content.lines() {
        let line = line.trim();

        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        if let Some((directive, value)) = line.split_once(':') {
            let directive = directive.trim();
            let value = value.trim();

            match directive {
                "title" => config.title.push(value.to_string()),
                "body" => config.body.push(value.to_string()),
                "author" => config.author.push(value.to_string()),
                "date" => config.date.push(value.to_string()),
                "strip" => config.strip.push(value.to_string()),
                "strip_id_or_class" => config.strip_id_or_class.push(value.to_string()),
                "test_url" => config.test_urls.push(value.to_string()),
                "prune" => config.prune = parse_bool(value)?,
                "tidy" => config.tidy = parse_bool(value)?,
                "autodetect_on_failure" => config.autodetect_on_failure = parse_bool(value)?,
                // TODO: Implement other directives (like http_header)
                _ => {}
            }
        }
    }

    Ok(config)
}

/// Parse a boolean value (yes/no, true/false, 1/0)
fn parse_bool(value: &str) -> Result<bool> {
    match value.to_lowercase().as_str() {
        "yes" | "true" | "1" => Ok(true),
        "no" | "false" | "0" => Ok(false),
        _ => Err(Error::ConfigError(format!("Invalid boolean value: {}", value))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_empty_config() {
        let config = parse_config("").unwrap();
        assert!(config.title.is_empty());
        assert!(config.body.is_empty());
    }

    #[test]
    fn test_parse_arxiv_config() {
        let content = r#"
title: //h1[contains(concat(' ',normalize-space(@class),' '),' title ')]
body: //blockquote[contains(concat(' ',normalize-space(@class),' '),' abstract ')]
date: //meta[@name='citation_date']/@content
author: //meta[@name='citation_author']/@content
test_url: https://arxiv.org/abs/2009.03017
test_url: https://arxiv.org/abs/2012.03780
        "#;

        let config = parse_config(content).unwrap();
        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
        assert_eq!(config.author.len(), 1);
        assert_eq!(config.date.len(), 1);
        assert_eq!(config.test_urls.len(), 2);
    }

    #[test]
    fn test_parse_with_comments() {
        let content = r#"
# This is a comment
title: //h1
# Another comment
body: //article
        "#;

        let config = parse_config(content).unwrap();
        assert_eq!(config.title.len(), 1);
        assert_eq!(config.body.len(), 1);
    }

    #[test]
    fn test_parse_boolean_directives() {
        let content = r#"
prune: no
tidy: yes
autodetect_on_failure: no
        "#;

        let config = parse_config(content).unwrap();
        assert!(!config.prune);
        assert!(config.tidy);
        assert!(!config.autodetect_on_failure);
    }

    #[test]
    fn test_parse_strip_directives() {
        let content = r#"
strip: //div[@class='sidebar']
strip: //div[@id='footer']
strip_id_or_class: advertisement
strip_id_or_class: nav
        "#;

        let config = parse_config(content).unwrap();
        assert_eq!(config.strip.len(), 2);
        assert_eq!(config.strip_id_or_class.len(), 2);
    }
}
+39
crates/readability/src/converter/html2md.rs
//! Markdown conversion using html2md crate

/// Convert HTML to Markdown
pub fn to_markdown(html: &str) -> String {
    html2md::parse_html(html)
}

/// Generate an excerpt from markdown (first ~200 chars)
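///
/// A usage sketch (truncation keeps at most `max_length` characters plus "..."):
///
/// ```ignore
/// let excerpt = generate_excerpt("A fairly long markdown string that needs cutting", 10);
/// assert!(excerpt.ends_with("..."));
/// ```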
pub fn generate_excerpt(markdown: &str, max_length: usize) -> String {
    let cleaned: String = markdown.chars().filter(|c| !c.is_control() || *c == '\n').collect();

    if cleaned.chars().count() <= max_length {
        cleaned
    } else {
        // Truncate on a character boundary so multi-byte UTF-8 input cannot panic.
        let truncated: String = cleaned.chars().take(max_length).collect();
        format!("{}...", truncated.trim_end())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_generate_excerpt() {
        let markdown =
            "This is a long piece of markdown text that should be truncated to approximately 200 characters or so.";
        let excerpt = generate_excerpt(markdown, 50);
        assert!(excerpt.len() <= 53);
        assert!(excerpt.ends_with("..."));
    }

    #[test]
    fn test_generate_excerpt_short() {
        let markdown = "Short text";
        let excerpt = generate_excerpt(markdown, 50);
        assert_eq!(excerpt, "Short text");
    }
}
+5
crates/readability/src/converter/mod.rs
//! HTML to Markdown conversion

pub mod html2md;

pub use self::html2md::to_markdown;
+23
crates/readability/src/error.rs
use thiserror::Error;

/// Errors that can occur during article extraction
#[derive(Error, Debug)]
pub enum Error {
    #[error("HTML parsing failed: {0}")]
    ParseError(String),

    #[error("XPath evaluation failed: {0}")]
    XPathError(String),

    #[error("Config parse error: {0}")]
    ConfigError(String),

    #[error("Extraction failed: {0}")]
    ExtractionError(String),

    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
}

/// Result type for readability operations
pub type Result<T> = std::result::Result<T, Error>;
+391
crates/readability/src/extractor/generic.rs
//! Generic content extraction with a simplified heuristic-based approach
//!
//! ## Implementation Strategy
//!
//! This is a **simplified** content extractor, not a full Mozilla Readability implementation.
//! It uses basic heuristics to find common patterns in HTML documents.
//!
//! ### What This Implementation Does:
//! - Extracts title from `<title>`, `<h1>`, or `og:title` meta tags
//! - Finds body content by looking for semantic HTML5 tags and common class names
//! - Extracts author from meta tags or common byline patterns
//! - Extracts date from meta tags or `<time>` elements
//! - Uses simple CSS selector patterns (no complex scoring algorithm)
//!
//! ### What This Implementation Does NOT Do (Implementation Gaps):
//! - **No content scoring**: Unlike Mozilla Readability, we don't score paragraphs by
//!   text length, link density, or class names to find the "best" content candidate
//! - **No sibling inclusion**: We don't check if siblings of the main content should
//!   be included based on similarity thresholds
//! - **No ancestor scoring**: We don't propagate scores up the DOM tree
//! - **No link density checking**: We don't filter out high link-density sections
//! - **No "unlikely candidate" removal**: We don't remove elements based on negative
//!   class name patterns like "sidebar", "comment", etc.
//! - **Limited fallback chain**: Mozilla Readability tries multiple strategies; we try
//!   a few common patterns and give up
//!
//! ### Design Decisions:
//! - **Semantic HTML first**: We prefer `<article>` and `<main>` over class-based selection
//!   because they're more reliable indicators of content
//! - **Multiple fallbacks**: We try progressively broader selectors to maximize success rate
//! - **Metadata from standards**: We use standard meta tags (Open Graph, Schema.org, etc.)
//!   before falling back to heuristics
//! - **Fail fast**: If we can't find content with our heuristics, we return an error
//!   rather than returning garbage content
//!
//! ## TODOs:
//! - TODO: Implement basic content scoring (count paragraphs, text length)
//! - TODO: Add link density checks to filter navigation/sidebar
//! - TODO: Remove unlikely candidates (ads, footers, etc.) by class name
//! - TODO: Try multiple content candidates and pick the best one
//! - TODO: Clean extracted HTML (remove scripts, styles, empty elements)
//! - TODO: Handle multi-page articles (pagination detection)

use crate::error::{Error, Result};
use scraper::{Html, Selector};

/// Extracted content from generic algorithm
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    pub title: String,
    pub body_html: String,
    pub author: Option<String>,
    pub date: Option<String>,
}

/// Generic content extractor using simple heuristics
///
/// This extractor attempts to find article content using common HTML patterns.
/// It's designed as a fallback when site-specific XPath rules are not available.
pub struct GenericExtractor {
    html: String,
}

impl GenericExtractor {
    /// Create a new generic extractor
    pub fn new(html: String) -> Self {
        Self { html }
    }

    /// Extract content using simple heuristics
    ///
    /// ## Extraction Strategy:
    /// 1. Title: `<title>` tag, then `<h1>`, then `og:title` meta tag
    /// 2. Body: `<article>`, then `<main>`, then `[role="main"]`, then `.content`
    /// 3. Author: meta tags (author, og:author, article:author), then `.byline`
    /// 4. Date: meta tags (article:published_time, datePublished), then `<time>`
    ///
    /// ## Limitations:
    /// - Returns first match, doesn't evaluate quality
    /// - No cleaning of extracted HTML (scripts, ads, etc. may be included)
    /// - May extract wrong content if page structure is unusual
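    ///
    /// A minimal sketch of the call (the HTML string is illustrative):
    ///
    /// ```ignore
    /// let extractor = GenericExtractor::new("<html>...</html>".to_string());
    /// let content = extractor.extract()?;
    /// println!("{}", content.title);
    /// ```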
    pub fn extract(&self) -> Result<ExtractedContent> {
        let document = Html::parse_document(&self.html);

        let title = self
            .extract_title(&document)
            .ok_or_else(|| Error::ExtractionError("Could not extract title".to_string()))?;

        let body_html = self
            .extract_body(&document)
            .ok_or_else(|| Error::ExtractionError("Could not extract body content".to_string()))?;

        let author = self.extract_author(&document);
        let date = self.extract_date(&document);
        Ok(ExtractedContent { title, body_html, author, date })
    }

    /// Extract title from document
    ///
    /// Tries in order:
    /// 1. `<title>` tag content
    /// 2. First `<h1>` tag
    /// 3. `og:title` meta tag
    ///
    /// ## Implementation Gap:
    /// - Doesn't try to clean the title (remove " | Site Name" suffixes, etc.)
    /// - Doesn't check title quality or length
    fn extract_title(&self, document: &Html) -> Option<String> {
        if let Ok(selector) = Selector::parse("title")
            && let Some(element) = document.select(&selector).next()
        {
            let text: String = element.text().collect();
            if !text.trim().is_empty() {
                return Some(text.trim().to_string());
            }
        }

        if let Ok(selector) = Selector::parse("h1")
            && let Some(element) = document.select(&selector).next()
        {
            let text: String = element.text().collect();
            if !text.trim().is_empty() {
                return Some(text.trim().to_string());
            }
        }

        if let Ok(selector) = Selector::parse("meta[property='og:title']")
            && let Some(element) = document.select(&selector).next()
            && let Some(content) = element.value().attr("content")
            && !content.trim().is_empty()
        {
            return Some(content.trim().to_string());
        }

        None
    }

    /// Extract body content from document
    ///
    /// Tries in order:
    /// 1. `<article>` tag (semantic HTML5)
    /// 2. `<main>` tag (semantic HTML5)
    /// 3. `[role="main"]` attribute (ARIA landmark)
    /// 4. First element with class containing "content", "article", "post", "entry"
    /// 5. `<body>` tag as last resort (usually includes nav, footer, etc.)
    ///
    /// ## Implementation Gaps:
    /// - Doesn't score multiple candidates to find the best one
    /// - Doesn't clean the HTML (may include ads, sidebars, etc.)
    /// - Doesn't check content length or quality
    /// - Doesn't exclude navigation, footers, comments within the selected element
    /// - Returns the matched element's outer HTML as-is without any processing
    ///
    /// TODO: Add basic cleaning (remove script, style, nav, footer, aside)
    /// TODO: Check content length (minimum threshold)
    /// TODO: If multiple candidates, pick the one with the most <p> tags
    fn extract_body(&self, document: &Html) -> Option<String> {
        let selectors = vec![
            "article",
            "main",
            "[role='main']",
            "[class*='content']",
            "[class*='article']",
            "[class*='post']",
            "[class*='entry']",
            "body",
        ];

        for selector_str in selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
            {
                let html = element.html();
                if !html.trim().is_empty() {
                    return Some(html);
                }
            }
        }

        None
    }

    /// Extract author from document
    ///
    /// Tries in order:
    /// 1. `<meta name="author">` tag
    /// 2. `<meta property="og:author">` tag
    /// 3. `<meta property="article:author">` tag
    /// 4. Element with class "author", "byline", or "by"
    ///
    /// ## Implementation Gaps:
    /// - Doesn't parse structured data (JSON-LD, Schema.org)
    /// - Doesn't extract from "By John Doe" patterns in text
    /// - Returns first match without validation
    fn extract_author(&self, document: &Html) -> Option<String> {
        let meta_selectors = vec![
            "meta[name='author']",
            "meta[property='og:author']",
            "meta[property='article:author']",
        ];

        for selector_str in meta_selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
                && let Some(content) = element.value().attr("content")
                && !content.trim().is_empty()
            {
                return Some(content.trim().to_string());
            }
        }

        let class_selectors = vec![".author", ".byline", ".by"];

        for selector_str in class_selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
            {
                let text: String = element.text().collect();
                if !text.trim().is_empty() {
                    return Some(text.trim().to_string());
                }
            }
        }

        None
    }

    /// Extract publication date from document
    ///
    /// Tries in order:
    /// 1. `<meta property="article:published_time">` (Open Graph)
    /// 2. `<meta itemprop="datePublished">` (Schema.org)
    /// 3. `<time datetime="...">` attribute
    /// 4. `<time>` element text content
    ///
    /// ## Implementation Gaps:
    /// - Doesn't parse or normalize date formats
    /// - Doesn't validate date values
    /// - Doesn't extract from text patterns ("Published on Jan 1, 2020")
    fn extract_date(&self, document: &Html) -> Option<String> {
        let meta_selectors = vec![
            "meta[property='article:published_time']",
            "meta[itemprop='datePublished']",
        ];

        for selector_str in meta_selectors {
            if let Ok(selector) = Selector::parse(selector_str)
                && let Some(element) = document.select(&selector).next()
                && let Some(content) = element.value().attr("content")
                && !content.trim().is_empty()
            {
                return Some(content.trim().to_string());
            }
        }

        if let Ok(selector) = Selector::parse("time[datetime]")
            && let Some(element) = document.select(&selector).next()
            && let Some(datetime) = element.value().attr("datetime")
            && !datetime.trim().is_empty()
        {
            return Some(datetime.trim().to_string());
        }

        if let Ok(selector) = Selector::parse("time")
            && let Some(element) = document.select(&selector).next()
        {
            let text: String = element.text().collect();
            if !text.trim().is_empty() {
                return Some(text.trim().to_string());
            }
        }
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_title_from_title_tag() {
        let html = r#"
            <html>
            <head><title>Test Article Title</title></head>
            <body></body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let title = extractor.extract_title(&document);

        assert_eq!(title, Some("Test Article Title".to_string()));
    }

    #[test]
    fn test_extract_title_from_h1() {
        let html = r#"
            <html>
            <body><h1>Article Heading</h1></body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let title = extractor.extract_title(&document);

        assert_eq!(title, Some("Article Heading".to_string()));
    }

    #[test]
    fn test_extract_body_from_article() {
        let html = r#"
            <html>
            <body>
                <article>
                    <p>This is the article content.</p>
                </article>
            </body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let body = extractor.extract_body(&document);

        assert!(body.is_some());
        assert!(body.unwrap().contains("This is the article content"));
    }

    #[test]
    fn test_extract_author_from_meta() {
        let html = r#"
            <html>
            <head>
                <meta name="author" content="John Doe">
            </head>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let author = extractor.extract_author(&document);

        assert_eq!(author, Some("John Doe".to_string()));
    }

    #[test]
    fn test_extract_date_from_meta() {
        let html = r#"
            <html>
            <head>
                <meta property="article:published_time" content="2024-01-15">
            </head>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let document = Html::parse_document(html);
        let date = extractor.extract_date(&document);

        assert_eq!(date, Some("2024-01-15".to_string()));
    }

    #[test]
    fn test_full_extraction() {
        let html = r#"
            <html>
            <head>
                <title>Test Article</title>
                <meta name="author" content="Jane Smith">
                <meta property="article:published_time" content="2024-01-15">
            </head>
            <body>
                <article>
                    <h1>Article Title</h1>
                    <p>Article content goes here.</p>
                </article>
            </body>
            </html>
        "#;

        let extractor = GenericExtractor::new(html.to_string());
        let result = extractor.extract().unwrap();

        assert_eq!(result.title, "Test Article");
        assert!(result.body_html.contains("Article content goes here"));
        assert_eq!(result.author, Some("Jane Smith".to_string()));
        assert_eq!(result.date, Some("2024-01-15".to_string()));
    }
}
+8
crates/readability/src/extractor/mod.rs
//! Content extraction using XPath rules and generic algorithms

pub mod generic;
pub mod scoring;
pub mod xpath;

pub use generic::GenericExtractor;
pub use xpath::XPathExtractor;
+42
crates/readability/src/extractor/scoring.rs
//! Content scoring for the Mozilla Readability algorithm
//!
//! TODO: Implement scoring

/// Content score for an element
#[derive(Debug, Clone)]
pub struct ContentScore {
    /// Text length of the element
    pub text_length: usize,
    /// Link density (0.0 to 1.0)
    pub link_density: f32,
    /// Class/ID weight (positive for content, negative for non-content)
    pub class_weight: f32,
    /// Total calculated score
    pub total: f32,
}

/// Positive class/ID patterns indicating content
pub const POSITIVE_PATTERNS: &[&str] = &[
    "article", "body", "content", "entry", "main", "page", "post", "text", "blog", "story",
];

/// Negative class/ID patterns indicating non-content
pub const NEGATIVE_PATTERNS: &[&str] = &[
    "combx",
    "comment",
    "community",
    "disqus",
    "extra",
    "footer",
    "header",
    "menu",
    "remark",
    "rss",
    "share",
    "sidebar",
    "sponsor",
    "ad-",
    "agegate",
    "pagination",
    "nav",
];
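
// The scoring itself is still TODO. As a hedged sketch only, one plausible shape
// for the eventual calculation (the coefficients below are illustrative
// assumptions, not the Readability algorithm's actual weights):
impl ContentScore {
    /// Combine the raw signals into a single score: longer text helps,
    /// high link density hurts, and class/ID patterns bias the result.
    pub fn compute(text_length: usize, link_density: f32, class_weight: f32) -> Self {
        let total = (text_length as f32).sqrt() * (1.0 - link_density) + class_weight;
        Self { text_length, link_density, class_weight, total }
    }
}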
+494
crates/readability/src/extractor/xpath.rs
//! XPath-based content extraction using site-specific rules
//!
//! This module provides content extraction from HTML documents using XPath-like expressions.
//!
//! ## Strategy
//!
//! Since Rust doesn't have a robust HTML-compatible XPath library, we use a hybrid approach:
//! 1. Convert simple XPath expressions to CSS selectors (scraper handles these well)
//! 2. Handle complex patterns (contains(), normalize-space()) with custom matchers
//! 3. Use regex parsing for XPath syntax to extract selector components
//!
//! ## Supported XPath Patterns
//!
//! - `//tag` - Simple tag selection
//! - `//tag[@id='value']` - ID selection
//! - `//tag[@class='value']` - Exact class match
//! - `//tag[contains(@class, 'value')]` - Class contains match
//! - `//tag[contains(concat(' ',normalize-space(@class),' '),' value ')]` - Normalized class match
//! - `//meta[@name='value']/@content` - Attribute extraction from meta tags

use crate::config::SiteConfig;
use crate::error::{Error, Result};
use regex::Regex;
use scraper::{ElementRef, Html, Selector};

/// Extracted content from XPath rules
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    pub title: Option<String>,
    pub body_html: Option<String>,
    pub author: Option<String>,
    pub date: Option<String>,
}

/// XPath-based extractor
pub struct XPathExtractor {
    html: String,
}

impl XPathExtractor {
    /// Create a new XPath extractor
    pub fn new(html: String) -> Self {
        Self { html }
    }

    /// Extract content using site-specific rules
    pub fn extract(&self, config: &SiteConfig) -> Result<ExtractedContent> {
        let cleaned_html = self.apply_strip_rules(&self.html, config)?;
        let document = Html::parse_document(&cleaned_html);

        let title = self.extract_field(&document, &config.title, false)?;
        let body_html = self.extract_field(&document, &config.body, true)?;
        let author = self.extract_field(&document, &config.author, false)?;
        let date = self.extract_field(&document, &config.date, false)?;

        Ok(ExtractedContent { title, body_html, author, date })
    }

    /// Apply strip rules to remove unwanted elements
    ///
    /// Processes both `strip` (XPath) and `strip_id_or_class` (substring match) directives.
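    ///
    /// For example, `strip_id_or_class: sidebar` removes `<div class="sidebar-widget">`,
    /// since matching is a case-insensitive substring test against @id and @class values.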
    fn apply_strip_rules(&self, html: &str, config: &SiteConfig) -> Result<String> {
        let document = Html::parse_document(html);
        let mut elements_to_remove: Vec<String> = Vec::new();

        for substring in &config.strip_id_or_class {
            let substring_lower = substring.to_lowercase();
            for element in document.tree.nodes() {
                if let Some(el) = ElementRef::wrap(element) {
                    let should_remove = el
                        .value()
                        .id()
                        .is_some_and(|id| id.to_lowercase().contains(&substring_lower))
                        || el
                            .value()
                            .classes()
                            .any(|class| class.to_lowercase().contains(&substring_lower));

                    if should_remove {
                        elements_to_remove.push(self.element_signature(&el));
                    }
                }
            }
        }

        for xpath in &config.strip {
            if let Some((css, _)) = self.xpath_to_css_with_attr(xpath)
                && let Ok(selector) = Selector::parse(&css)
            {
                for el in document.select(&selector) {
                    elements_to_remove.push(self.element_signature(&el));
                }
            }
        }

        self.rebuild_html_without_elements(&document, &elements_to_remove)
    }

    /// Generate a signature for an element to identify it during rebuild
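    ///
    /// Note: the signature is tag + id + class list, so identically-attributed
    /// elements share one signature and are removed together during the rebuild.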
    fn element_signature(&self, el: &ElementRef) -> String {
        let tag = el.value().name();
        let id = el.value().id().unwrap_or("");
        let classes: Vec<&str> = el.value().classes().collect();
        format!("{}#{}#{}", tag, id, classes.join(","))
    }

    /// Rebuild HTML without specified elements
    fn rebuild_html_without_elements(&self, document: &Html, to_remove: &[String]) -> Result<String> {
        if to_remove.is_empty() {
            return Ok(self.html.clone());
        }

        let mut result = String::new();
        self.rebuild_node(&document.root_element(), to_remove, &mut result);
        Ok(result)
    }

    /// Recursively rebuild a node and its children, skipping removed elements
    fn rebuild_node(&self, element: &ElementRef, to_remove: &[String], output: &mut String) {
        let sig = self.element_signature(element);
        if to_remove.contains(&sig) {
            return;
        }

        let tag = element.value().name();
        output.push('<');
        output.push_str(tag);

        for (name, value) in element.value().attrs() {
            output.push(' ');
            output.push_str(name);
            output.push_str("=\"");
            output.push_str(&html_escape::encode_double_quoted_attribute(value));
            output.push('"');
        }
        output.push('>');

        for child in element.children() {
            if let Some(el) = ElementRef::wrap(child) {
                self.rebuild_node(&el, to_remove, output);
            } else if let Some(text) = child.value().as_text() {
                output.push_str(&html_escape::encode_text(&text.to_string()));
            }
        }

        const VOID_ELEMENTS: &[&str] = &[
            "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track",
            "wbr",
        ];

        if !VOID_ELEMENTS.contains(&tag) {
            output.push_str("</");
            output.push_str(tag);
            output.push('>');
        }
    }

    /// Extract a field using XPath expressions (tries each in order)
    fn extract_field(&self, document: &Html, xpaths: &[String], extract_html: bool) -> Result<Option<String>> {
        for xpath_expr in xpaths {
            if let Some(result) = self.evaluate_xpath(document, xpath_expr, extract_html)? {
                return Ok(Some(result));
            }
        }
        Ok(None)
    }

    /// Evaluate an XPath expression against the document
    fn evaluate_xpath(&self, document: &Html, xpath: &str, extract_html: bool) -> Result<Option<String>> {
        let (xpath_part, attr_to_extract) = if let Some(pos) = xpath.rfind("/@") {
            (&xpath[..pos], Some(&xpath[pos + 2..]))
        } else {
            (xpath, None)
        };

        let (css, class_filter) = match self.xpath_to_css_with_attr(xpath_part) {
            Some(result) => result,
            None => return Ok(None),
        };

        let selector =
            Selector::parse(&css).map_err(|e| Error::XPathError(format!("Invalid CSS selector '{}': {:?}", css, e)))?;

        for element in document.select(&selector) {
            if let Some(ref filter) = class_filter
                && !self.element_has_class_containing(&element, filter)
            {
                continue;
            }

            if let Some(attr) = attr_to_extract {
                if let Some(value) = element.value().attr(attr) {
                    return Ok(Some(value.to_string()));
                }
                continue;
            }

            let content =
                if extract_html { element.inner_html() } else { element.text().collect::<Vec<_>>().join(" ") };

            let content = content.trim().to_string();
            if !content.is_empty() {
                return Ok(Some(content));
            }
        }

        Ok(None)
    }

    /// Convert XPath to CSS selector with optional class filter
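    ///
    /// For instance, `//div[contains(@class, 'content')]` becomes the CSS selector
    /// `div` plus a `content` class-substring filter applied while matching.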
    fn xpath_to_css_with_attr(&self, xpath: &str) -> Option<(String, Option<String>)> {
        let xpath = xpath.trim();

        if xpath.starts_with("//") && !xpath.contains('[') && !xpath.contains('@') {
            let tag = xpath.trim_start_matches("//");
            return Some((tag.to_string(), None));
        }

        if let Some(css) = self.parse_id_selector(xpath) {
            return Some((css, None));
        }

        if let Some((css, class_filter)) = self.parse_contains_class_normalized(xpath) {
            return Some((css, Some(class_filter)));
        }

        if let Some((css, class_filter)) = self.parse_contains_class_simple(xpath) {
            return Some((css, Some(class_filter)));
        }

        if let Some(css) = self.parse_exact_class(xpath) {
            return Some((css, None));
        }

        if let Some(css) = self.parse_any_tag_with_id(xpath) {
            return Some((css, None));
        }

        if let Some(css) = self.parse_meta_selector(xpath) {
            return Some((css, None));
        }

        None
    }

    /// Parse //tag[@id='value'] pattern
    fn parse_id_selector(&self, xpath: &str) -> Option<String> {
        let re = Regex::new(r#"//(\w+)\[@id\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let id = caps.get(2)?.as_str();
        Some(format!("{}#{}", tag, id))
    }

    /// Parse //*[@id='value'] pattern
    fn parse_any_tag_with_id(&self, xpath: &str) -> Option<String> {
        let re = Regex::new(r#"//\*\[@id\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let id = caps.get(1)?.as_str();
        Some(format!("#{}", id))
    }

    /// Parse //tag[@class='value'] pattern (exact class match)
    fn parse_exact_class(&self, xpath: &str) -> Option<String> {
        if xpath.contains("contains") {
            return None;
        }
        let re = Regex::new(r#"//(\w+)\[@class\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let class = caps.get(2)?.as_str();
        Some(format!("{}[class=\"{}\"]", tag, class))
    }

    /// Parse //tag[contains(@class, 'value')] pattern
    fn parse_contains_class_simple(&self, xpath: &str) -> Option<(String, String)> {
        let re = Regex::new(r#"//(\w+)\[contains\s*\(\s*@class\s*,\s*['"]([^'"]+)['"]\s*\)\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let class_substr = caps.get(2)?.as_str();
        Some((tag.to_string(), class_substr.to_string()))
    }

    /// Parse //tag[contains(concat(' ',normalize-space(@class),' '),' value ')] pattern
    fn parse_contains_class_normalized(&self, xpath: &str) -> Option<(String, String)> {
        let re = Regex::new(r#"//(\w+)\[contains\s*\(\s*concat\s*\(.+\)\s*,\s*['"]([^'"]+)['"]\s*\)\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let tag = caps.get(1)?.as_str();
        let class_name = caps.get(2)?.as_str().trim();
        Some((tag.to_string(), class_name.to_string()))
    }

    /// Parse //meta[@name='value'] pattern
    fn parse_meta_selector(&self, xpath: &str) -> Option<String> {
        let re = Regex::new(r#"//meta\[@(\w+)\s*=\s*['"]([^'"]+)['"]\]"#).ok()?;
        let caps = re.captures(xpath)?;
        let attr_name = caps.get(1)?.as_str();
        let attr_value = caps.get(2)?.as_str();
        Some(format!("meta[{}=\"{}\"]", attr_name, attr_value))
    }

    /// Check if element has a class containing the given substring
    fn element_has_class_containing(&self, element: &ElementRef, class_filter: &str) -> bool {
        element.value().classes().any(|class| class.contains(class_filter))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::parser::SiteConfig;

    #[test]
    fn test_xpath_to_css_simple_tag() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor.xpath_to_css_with_attr("//h1").unwrap();
        assert_eq!(css, "h1");
        assert!(filter.is_none());
    }

    #[test]
    fn test_xpath_to_css_id_selector() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor.xpath_to_css_with_attr("//h1[@id='firstHeading']").unwrap();
        assert_eq!(css, "h1#firstHeading");
        assert!(filter.is_none());
    }

    #[test]
    fn test_xpath_to_css_any_tag_with_id() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor.xpath_to_css_with_attr("//*[@id='bodyContent']").unwrap();
        assert_eq!(css, "#bodyContent");
        assert!(filter.is_none());
    }

    #[test]
    fn test_xpath_contains_class_simple() {
        let extractor = XPathExtractor::new(String::new());
        let (css, filter) = extractor
            .xpath_to_css_with_attr("//div[contains(@class, 'content')]")
            .unwrap();
        assert_eq!(css, "div");
        assert_eq!(filter, Some("content".to_string()));
    }

    #[test]
    fn test_xpath_contains_class_normalized() {
        let extractor = XPathExtractor::new(String::new());
        let xpath = "//h1[contains(concat(' ',normalize-space(@class),' '),' title ')]";
        let (css, filter) = extractor.xpath_to_css_with_attr(xpath).unwrap();
        assert_eq!(css, "h1");
        assert_eq!(filter, Some("title".to_string()));
    }

    #[test]
    fn test_extract_meta_attribute() {
        let html = r#"
            <html>
            <head>
                <meta name="citation_date" content="2020-09-07">
                <meta name="citation_author" content="John Doe">
            </head>
            </html>
        "#;

        let extractor = XPathExtractor::new(html.to_string());
        let document = Html::parse_document(html);

        let date = extractor
            .evaluate_xpath(&document, "//meta[@name='citation_date']/@content", false)
            .unwrap();
        assert_eq!(date, Some("2020-09-07".to_string()));

        let author = extractor
            .evaluate_xpath(&document, "//meta[@name='citation_author']/@content", false)
            .unwrap();
        assert_eq!(author, Some("John Doe".to_string()));
    }

    #[test]
    fn test_extract_with_contains_class() {
        let html = r#"
            <html>
            <body>
                <h1 class="page-title title main">Article Title</h1>
                <div class="article-content">Content here</div>
            </body>
            </html>
        "#;

        let extractor = XPathExtractor::new(html.to_string());
        let document = Html::parse_document(html);

        let title = extractor
            .evaluate_xpath(&document, "//h1[contains(@class, 'title')]", false)
            .unwrap();
        assert_eq!(title, Some("Article Title".to_string()));
    }

    #[test]
    fn test_strip_id_or_class() {
        let html = r#"
            <html>
            <body>
                <div id="main-content">Main content</div>
                <div class="sidebar-widget">Sidebar</div>
                <div class="advertisement-banner">Ad</div>
            </body>
            </html>
        "#;

        let config = SiteConfig {
            strip_id_or_class: vec!["sidebar".to_string(), "advertisement".to_string()],
            ..Default::default()
        };

        let extractor = XPathExtractor::new(html.to_string());
        let cleaned = extractor.apply_strip_rules(html, &config).unwrap();

        assert!(cleaned.contains("Main content"));
        assert!(!cleaned.contains("Sidebar"));
        assert!(!cleaned.contains("Ad"));
    }

    #[test]
    fn test_strip_xpath() {
        let html = r#"
            <html>
            <body>
                <div id="content">Main content</div>
                <div id="toc">Table of contents</div>
                <div id="footer">Footer</div>
            </body>
            </html>
        "#;

        let config = SiteConfig {
            strip: vec!["//*[@id='toc']".to_string(), "//div[@id='footer']".to_string()],
            ..Default::default()
        };

        let extractor = XPathExtractor::new(html.to_string());
        let cleaned = extractor.apply_strip_rules(html, &config).unwrap();

        assert!(cleaned.contains("Main content"));
        assert!(!cleaned.contains("Table of contents"));
        assert!(!cleaned.contains("Footer"));
    }

    #[test]
    fn test_full_extraction() {
        let html = r#"
            <html>
            <head>
                <meta name="author" content="Test Author">
                <meta name="date" content="2024-01-15">
            </head>
            <body>
                <h1 id="title">Test Title</h1>
                <article class="content">
                    <p>Article content here.</p>
                </article>
                <div class="sidebar">Sidebar content</div>
            </body>
            </html>
        "#;

        let config = SiteConfig {
            title: vec!["//h1[@id='title']".to_string()],
            body: vec!["//article".to_string()],
            author: vec!["//meta[@name='author']/@content".to_string()],
            date: vec!["//meta[@name='date']/@content".to_string()],
            strip_id_or_class: vec!["sidebar".to_string()],
            ..Default::default()
        };

        let extractor = XPathExtractor::new(html.to_string());
        let result = extractor.extract(&config).unwrap();

        assert_eq!(result.title, Some("Test Title".to_string()));
        assert!(result.body_html.unwrap().contains("Article content here"));
        assert_eq!(result.author, Some("Test Author".to_string()));
        assert_eq!(result.date, Some("2024-01-15".to_string()));
    }
}
+135
crates/readability/src/lib.rs
//! Article extraction library with support for site-specific XPath rules and generic content extraction.
//!
//! This crate provides functionality to extract clean article content from HTML pages using:
//! - Site-specific XPath rules (ftr-site-config format)
//! - Generic content extraction (a simplified, Readability-style heuristic fallback)
//! - Automatic markdown conversion
//!
//! # Example
//!
//! ```no_run
//! use malfestio_readability::Readability;
//! use std::path::PathBuf;
//!
//! let html = r#"<html><head><title>Article</title></head><body>...</body></html>"#;
//! let readability = Readability::new(html.to_string(), Some("https://example.com/article"))
//!     .with_rules_dir(PathBuf::from("rules"));
//!
//! let article = readability.parse().unwrap();
//! println!("Title: {}", article.title);
//! println!("Markdown: {}", article.markdown);
//! ```

pub mod cleaner;
pub mod config;
pub mod converter;
pub mod error;
pub mod extractor;

use std::path::PathBuf;

pub use error::{Error, Result};

/// Extracted article content
#[derive(Debug, Clone)]
pub struct Article {
    /// Article title
    pub title: String,
    /// Clean HTML content
    pub content: String,
    /// Markdown formatted content
    pub markdown: String,
    /// Article author (if found)
    pub author: Option<String>,
    /// Publication date (if found)
    pub published_date: Option<String>,
    /// Excerpt (first ~200 chars of content)
    pub excerpt: Option<String>,
}

/// Main entry point for article extraction
pub struct Readability {
    html: String,
    url: Option<String>,
    rules_dir: Option<PathBuf>,
}

impl Readability {
    /// Create a new Readability instance
    ///
    /// # Arguments
    ///
    /// * `html` - The HTML content to extract from
    /// * `url` - Optional URL of the article (used for rule matching)
    pub fn new(html: String, url: Option<&str>) -> Self {
        Self { html, url: url.map(String::from), rules_dir: None }
    }

    /// Set the directory containing extraction rules
    ///
    /// Rules files should be named `domain.com.txt` or `.domain.com.txt` for subdomain matching.
    pub fn with_rules_dir(mut self, path: PathBuf) -> Self {
        self.rules_dir = Some(path);
        self
    }

    /// Extract article content from HTML
    ///
    /// ## Extraction Flow:
    /// 1. If a URL was provided: try to load site-specific XPath rules (external rules_dir first, then embedded rules)
    /// 2. If rules found: Attempt XPath-based extraction
    /// 3. If no rules OR XPath extraction fails: Fall back to generic heuristic extraction
    /// 4. Convert extracted HTML to markdown
    /// 5. Generate excerpt from markdown
    /// 6. Return complete Article struct
    ///
    /// ## Implementation Gaps:
    /// - XPath support is limited to the patterns `XPathExtractor` recognizes; unsupported
    ///   expressions fall back to generic extraction
    /// - No content cleaning between XPath/generic extraction and markdown conversion
    ///   (scripts, styles, etc. may be present in extracted HTML)
    /// - Generic extraction may include non-content elements (nav, footer, etc.)
    ///
    /// ## Design Decision:
    /// We prefer to return *something* (via generic extraction) rather than fail completely.
    /// This maximizes success rate at the cost of potentially lower quality extraction.
    ///
    /// TODO: Add HTML cleaning step before markdown conversion
    /// TODO: Add content validation (minimum length, etc.)
    pub fn parse(&self) -> Result<Article> {
        use config::ConfigLoader;
        use converter::to_markdown;
        use extractor::XPathExtractor;

        let (title, content, author, date) = if let Some(ref url) = self.url {
            // Honor an external rules directory when one was configured.
            let loader = match self.rules_dir.clone() {
                Some(dir) => ConfigLoader::with_rules_dir(dir),
                None => ConfigLoader::new(),
            };

            if let Some(config) = loader.load_for_url(url)? {
                let xpath_extractor = XPathExtractor::new(self.html.clone());

                // Fall back to generic extraction if XPath extraction fails or is incomplete.
                match xpath_extractor.extract(&config) {
                    Ok(result) => {
                        if let (Some(title), Some(body)) = (result.title, result.body_html) {
                            (title, body, result.author, result.date)
                        } else {
                            self.extract_with_generic()?
                        }
                    }
                    Err(_) => self.extract_with_generic()?,
                }
            } else {
                self.extract_with_generic()?
            }
        } else {
            self.extract_with_generic()?
        };

        let markdown = to_markdown(&content);
        let excerpt = Some(converter::html2md::generate_excerpt(&markdown, 200));
        Ok(Article { title, content, markdown, author, published_date: date, excerpt })
    }

    /// Extract using generic heuristic-based algorithm
    fn extract_with_generic(&self) -> Result<(String, String, Option<String>, Option<String>)> {
        let generic_extractor = extractor::GenericExtractor::new(self.html.clone());
        let result = generic_extractor.extract()?;
        Ok((result.title, result.body_html, result.author, result.date))
    }
}
+1-2
crates/server/Cargo.toml
···
 deadpool-postgres = "0.14.0"
 dotenvy = "0.15.7"
 ed25519-dalek = { version = "2.2.0", features = ["serde"] }
-dom_smoothie = "0.4"
 getrandom = { version = "0.3", features = ["std"] }
-html2md = "0.2.15"
 malfestio-core = { version = "0.1.0", path = "../core" }
+malfestio-readability = { version = "0.1.0", path = "../readability" }
 regex = "1.12.2"
 reqwest = { version = "0.12", features = ["json"] }
 serde = "1.0.228"
+20-40
crates/server/src/api/importer.rs
 use crate::middleware::auth::UserContext;
 use crate::state::SharedState;
 use axum::{Json, extract::Extension, http::StatusCode, response::IntoResponse};
-use dom_smoothie::Readability;
 use malfestio_core::model::Visibility;
+use malfestio_readability::Readability;
 use serde::{Deserialize, Serialize};
 use serde_json::json;

···
     }

     let url = payload.url.clone();
-
-    // Fetch HTML content
     let html_result = reqwest::get(&url).await;
     let html_content = match html_result {
         Ok(response) => match response.text().await {
···
         }
     };

-    // Extract article using dom_smoothie
     let url_for_task = url.clone();
-    let result = tokio::task::spawn_blocking(
-        move || -> Result<(String, String, Option<String>, Option<String>), String> {
-            let mut readability = Readability::new(html_content, Some(&url_for_task), None)
-                .map_err(|e| format!("Readability error: {}", e))?;
-            let article = readability.parse().map_err(|e| format!("Parse error: {}", e))?;
-            Ok((
-                article.title,
-                article.content.to_string(),
-                article.byline,
-                article.published_time,
-            ))
-        },
-    )
+    let result = tokio::task::spawn_blocking(move || -> Result<malfestio_readability::Article, String> {
+        let readability = Readability::new(html_content, Some(&url_for_task));
+        readability.parse().map_err(|e| format!("Parse error: {}", e))
+    })
     .await;

     match result {
-        Ok(Ok((title, content, author, publish_date))) => {
-            // Convert HTML content to markdown
-            let markdown = html2md::parse_html(&content);
+        Ok(Ok(article)) => {
+            let markdown = article.markdown;

             let response = ImportArticleResponse {
-                title,
+                title: article.title,
                 markdown,
-                metadata: ArticleMetadata { author, publish_date, source_url: payload.url },
+                metadata: ArticleMetadata {
+                    author: article.author,
+                    publish_date: article.published_date,
+                    source_url: payload.url,
+                },
             };

             Json(response).into_response()
···
     }

     let url = payload.url.clone();
-
-    // Fetch HTML content
     let html_result = reqwest::get(&url).await;
     let html_content = match html_result {
         Ok(response) => match response.text().await {
···
         }
     };

-    // Extract article using dom_smoothie
     let url_for_task = url.clone();
-    let result = tokio::task::spawn_blocking(move || -> Result<(String, String), String> {
-        let mut readability = Readability::new(html_content, Some(&url_for_task), None)
-            .map_err(|e| format!("Readability error: {}", e))?;
-        let article = readability.parse().map_err(|e| format!("Parse error: {}", e))?;
-        Ok((article.title, article.content.to_string()))
+    let result = tokio::task::spawn_blocking(move || -> Result<malfestio_readability::Article, String> {
+        let readability = Readability::new(html_content, Some(&url_for_task));
+        readability.parse().map_err(|e| format!("Parse error: {}", e))
     })
     .await;

     match result {
-        Ok(Ok((title, content))) => {
-            // Convert HTML content to markdown
-            let markdown = html2md::parse_html(&content);
-
-            // Merge auto-tags with user-provided tags
+        Ok(Ok(article)) => {
+            let title = article.title;
+            let markdown = article.markdown;
             let mut tags = payload.tags.clone();
             if !tags.contains(&"imported".to_string()) {
                 tags.push("imported".to_string());
···
                 tags.push("article".to_string());
             }

-            // Store source URL as first link
             let links = vec![payload.url.clone()];

-            // Create note
             match state
                 .note_repo
                 .create(&user_ctx.did, &title, &markdown, tags, payload.visibility, links)
···
     let body_json: serde_json::Value = serde_json::from_slice(&body_bytes).unwrap();
     let title = body_json["title"].as_str().unwrap();
     assert!(title.contains("Rust"));
-    // Verify markdown field exists and is non-empty
+
     let markdown = body_json["markdown"].as_str().unwrap();
     assert!(markdown.len() > 100);
-    // Verify no HTML tags leak through
     assert!(!markdown.contains("<div"));
     assert!(!markdown.contains("<p>"));
-    // Verify metadata structure exists
     assert!(body_json["metadata"].is_object());
     assert_eq!(
         body_json["metadata"]["source_url"].as_str().unwrap(),