more · anil.recoil.org/atomic-eeg@daa0a92

+223 -3

aggregate_feeds.py

··· 5 5 # "feedgenerator", 6 6 # "requests", 7 7 # "beautifulsoup4", 8 + # "urllib3", 8 9 # ] 9 10 # /// 10 11 # Do not delete the above as its needed for `uv run` ··· 21 22 import re 22 23 from html import unescape 23 24 from bs4 import BeautifulSoup 25 + from urllib.parse import urlparse, urljoin 24 26 25 27 def load_feed_urls(file_path): 26 28 with open(file_path, 'r') as f: ··· 147 149 # Get link 148 150 link = entry.get('link', '') 149 151 150 - # Get description/content 152 + # Get full content from the feed entry 151 153 if hasattr(entry, 'content') and entry.content: 152 154 content = entry.content[0].value 153 155 else: 154 156 content = entry.get('summary', '') 155 - 157 + 156 158 # Create HTML preview that will be used as the content 157 159 preview = create_html_preview(content) 158 160 ··· 162 164 all_entries.append({ 163 165 'title': title, 164 166 'link': link, 165 - 'content': content, 167 + 'content': content, # Use the feed content directly 166 168 'preview': preview, 167 169 'author': author_name, 168 170 'pub_date': pub_date, ··· 207 209 208 210 return feed 209 211 212 + # Functions from make_threads.py 213 + 214 + def extract_links_from_html(html_content, base_url=None): 215 + """Extract and normalize links from HTML content""" 216 + soup = BeautifulSoup(html_content, 'html.parser') 217 + links = [] 218 + 219 + for a_tag in soup.find_all('a', href=True): 220 + href = a_tag['href'].strip() 221 + 222 + # Skip empty links, anchors, javascript, and mailto 223 + if not href or href.startswith(('#', 'javascript:', 'mailto:')): 224 + continue 225 + 226 + # Convert relative URLs to absolute if we have a base URL 227 + if base_url and not href.startswith(('http://', 'https://')): 228 + href = urljoin(base_url, href) 229 + 230 + links.append(href) 231 + 232 + return links 233 + 234 + def normalize_url(url): 235 + """Normalize URLs to consistently match them""" 236 + if not url: 237 + return "" 238 + 239 + # Handle common URL shorteners or redirects (not 
implemented) 240 + 241 + # Parse the URL 242 + parsed = urlparse(url) 243 + 244 + # Ensure scheme is consistent 245 + scheme = parsed.scheme.lower() or 'http' 246 + 247 + # Normalize netloc (lowercase, remove 'www.' prefix optionally) 248 + netloc = parsed.netloc.lower() 249 + if netloc.startswith('www.'): 250 + netloc = netloc[4:] 251 + 252 + # Remove trailing slashes and index.html/index.php 253 + path = parsed.path.rstrip('/') 254 + for index_file in ['/index.html', '/index.php', '/index.htm']: 255 + if path.endswith(index_file): 256 + path = path[:-len(index_file)] 257 + 258 + # Remove common fragments and query parameters that don't affect content 259 + # (like tracking params, utm_*, etc.) 260 + query_parts = [] 261 + if parsed.query: 262 + for param in parsed.query.split('&'): 263 + if '=' in param: 264 + key, value = param.split('=', 1) 265 + if not key.startswith(('utm_', 'ref', 'source')): 266 + query_parts.append(f"{key}={value}") 267 + 268 + query = '&'.join(query_parts) 269 + 270 + # Remove common hash fragments 271 + fragment = '' 272 + 273 + # Special case for common blogging platforms 274 + # Medium, WordPress, Ghost, etc. may have specific URL patterns 275 + 276 + # Reconstruct the URL 277 + normalized = f"{scheme}://{netloc}{path}" 278 + if query: 279 + normalized += f"?{query}" 280 + if fragment: 281 + normalized += f"#{fragment}" 282 + 283 + return normalized 284 + 285 + def get_domain(url): 286 + """Extract domain from a URL""" 287 + parsed = urlparse(url) 288 + domain = parsed.netloc.lower() 289 + # Remove 'www.' 
prefix if present 290 + if domain.startswith('www.'): 291 + domain = domain[4:] 292 + return domain 293 + 294 + def generate_threads(entries): 295 + """Generate thread data from the entries""" 296 + print(f"Generating thread data from {len(entries)} entries...", file=sys.stderr) 297 + 298 + entry_urls = {} # Maps normalized URLs to entry data 299 + 300 + # First pass: collect all entries and their URLs 301 + for entry in entries: 302 + # Get link 303 + link = entry['link'] 304 + if not link: 305 + continue 306 + 307 + # Normalize the entry URL to help with matching 308 + normalized_link = normalize_url(link) 309 + 310 + # Get the domain of the entry 311 + entry_domain = get_domain(link) 312 + 313 + # Use the feed content to extract links 314 + content_to_extract = entry['content'] 315 + 316 + # Extract all links from content, using the entry link as base URL for resolving relative URLs 317 + content_links = extract_links_from_html(content_to_extract, base_url=link) 318 + 319 + entry_data = { 320 + 'title': entry['title'], 321 + 'link': link, 322 + 'normalized_link': normalized_link, 323 + 'domain': entry_domain, 324 + 'feed_title': entry['feed_title'], 325 + 'id': entry['id'], 326 + 'content_links': content_links, 327 + 'references': [], # Will be filled in the second pass 328 + 'referenced_by': [], # Will be filled in the second pass 329 + 'external_links': [] # Links to content outside the feed 330 + } 331 + 332 + entry_urls[normalized_link] = entry_data 333 + 334 + print(f"Extracted links from all entries", file=sys.stderr) 335 + 336 + # Second pass: analyze links between entries 337 + for entry_id, entry_data in entry_urls.items(): 338 + # Keep track of references to avoid duplicates 339 + reference_ids = set() 340 + normalized_content_links = [normalize_url(link) for link in entry_data['content_links']] 341 + 342 + for i, normalized_link in enumerate(normalized_content_links): 343 + original_link = entry_data['content_links'][i] if i < 
len(entry_data['content_links']) else normalized_link 344 + 345 + # Check if this is a link to another entry in the feed 346 + if normalized_link in entry_urls and normalized_link != entry_data['normalized_link']: 347 + referenced_entry = entry_urls[normalized_link] 348 + 349 + # Avoid duplicate references 350 + if referenced_entry['id'] in reference_ids: 351 + continue 352 + 353 + reference_ids.add(referenced_entry['id']) 354 + 355 + # Add to the references of the current entry 356 + entry_data['references'].append({ 357 + 'id': referenced_entry['id'], 358 + 'link': referenced_entry['link'], 359 + 'title': referenced_entry['title'], 360 + 'feed_title': referenced_entry['feed_title'], 361 + 'in_feed': True # Mark as a reference to a post in the feed 362 + }) 363 + 364 + # Add to the referenced_by of the referenced entry 365 + # Check if this entry is already in referenced_by 366 + already_referenced = any(ref['id'] == entry_data['id'] for ref in referenced_entry['referenced_by']) 367 + if not already_referenced: 368 + referenced_entry['referenced_by'].append({ 369 + 'id': entry_data['id'], 370 + 'link': entry_data['link'], 371 + 'title': entry_data['title'], 372 + 'feed_title': entry_data['feed_title'], 373 + 'in_feed': True # Mark as a reference from a post in the feed 374 + }) 375 + elif normalized_link != entry_data['normalized_link']: 376 + # This is a link to something outside the feed 377 + # Check if it's from the same domain as the entry 378 + link_domain = get_domain(original_link) 379 + 380 + # Only include external links from different domains 381 + if link_domain != entry_data['domain']: 382 + # Track as an external link if not already in the list 383 + if not any(ext_link['url'] == original_link for ext_link in entry_data['external_links']): 384 + external_link = { 385 + 'url': original_link, 386 + 'normalized_url': normalized_link, 387 + 'in_feed': False # Mark as external to the feed 388 + } 389 + entry_data['external_links'].append(external_link) 
390 + 391 + # Create the thread data structure 392 + thread_data = {} 393 + for _, entry_data in entry_urls.items(): 394 + thread_data[entry_data['id']] = { 395 + 'id': entry_data['id'], 396 + 'title': entry_data['title'], 397 + 'link': entry_data['link'], 398 + 'feed_title': entry_data['feed_title'], 399 + 'references': entry_data['references'], 400 + 'referenced_by': entry_data['referenced_by'], 401 + 'external_links': entry_data['external_links'] 402 + } 403 + 404 + # Generate some statistics 405 + entries_with_references = sum(1 for entry_data in entry_urls.values() if entry_data['references']) 406 + entries_with_referenced_by = sum(1 for entry_data in entry_urls.values() if entry_data['referenced_by']) 407 + entries_with_external_links = sum(1 for entry_data in entry_urls.values() if entry_data['external_links']) 408 + total_internal_references = sum(len(entry_data['references']) for entry_data in entry_urls.values()) 409 + total_external_links = sum(len(entry_data['external_links']) for entry_data in entry_urls.values()) 410 + 411 + print(f"\nThread Analysis:", file=sys.stderr) 412 + print(f"Total entries: {len(entry_urls)}", file=sys.stderr) 413 + print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr) 414 + print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr) 415 + print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr) 416 + print(f"Total internal references: {total_internal_references}", file=sys.stderr) 417 + print(f"Total external links: {total_external_links}", file=sys.stderr) 418 + 419 + return thread_data 420 + 210 421 def main(): 211 422 # Load feed URLs 212 423 feed_urls = load_feed_urls('feed.json') ··· 235 446 feed.write(f, 'utf-8') 236 447 237 448 print(f"Feed successfully written to eeg.xml", file=sys.stderr) 449 + 450 + # Generate thread data 451 + thread_data = generate_threads(entries) 452 + 453 + # Write the 
thread data to a JSON file 454 + with open('threads.json', 'w') as f: 455 + json.dump(thread_data, f, indent=2) 456 + 457 + print(f"Thread data successfully written to threads.json", file=sys.stderr) 238 458 239 459 if __name__ == "__main__": 240 460 main()

-256

make_threads.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "feedparser",
#   "beautifulsoup4",
#   "urllib3",
# ]
# ///
# Do not delete the above as it's needed for `uv run`
"""Build ``threads.json``: a cross-reference map of the aggregated feed.

The script parses the aggregated feed, extracts every hyperlink from each
entry's content, and records for each entry

* ``references``     - other feed entries it links to,
* ``referenced_by``  - other feed entries that link to it,
* ``external_links`` - links to other domains that are not feed entries.
"""

import json
import os
import re
import sys
from urllib.parse import urljoin, urlparse

import feedparser
from bs4 import BeautifulSoup

# href prefixes that never point at a web page (compared case-insensitively).
_SKIPPED_HREF_PREFIXES = ('#', 'javascript:', 'mailto:', 'tel:', 'data:')

# Trailing path components that are equivalent to the bare directory URL.
_INDEX_FILES = ('/index.html', '/index.php', '/index.htm')

# Query keys starting with any of these are treated as tracking noise.
# NOTE: this is a prefix match, so e.g. "reference=..." is dropped as well.
_TRACKING_KEY_PREFIXES = ('utm_', 'ref', 'source')


def extract_links_from_html(html_content, base_url=None):
    """Return the hyperlinks found in *html_content*, in document order.

    Args:
        html_content: HTML markup (may be empty or ``None``).
        base_url: If given, relative hrefs are resolved against it.

    Returns:
        A list of href strings.  Empty hrefs, in-page anchors, and
        ``javascript:``/``mailto:``/``tel:``/``data:`` links are skipped,
        as are hrefs that cannot be parsed as URLs at all.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # Schemes are case-insensitive, so compare against a lowered copy.
        if not href or href.lower().startswith(_SKIPPED_HREF_PREFIXES):
            continue

        try:
            # Convert relative URLs to absolute if we have a base URL
            if base_url and not href.startswith(('http://', 'https://')):
                href = urljoin(base_url, href)
            # Reject hrefs that urlparse cannot handle (e.g. unbalanced IPv6
            # brackets) here, so one bad link cannot abort the later passes.
            urlparse(href)
        except ValueError:
            continue

        links.append(href)

    return links


def normalize_url(url):
    """Return a canonical form of *url* used to match links to entries.

    The host is lowercased and stripped of a leading ``www.``; trailing
    slashes and ``index.html``-style suffixes are removed from the path;
    tracking query parameters and the fragment are dropped.  A missing
    scheme defaults to ``http``.  An empty *url* yields ``""``.
    """
    if not url:
        return ""

    parsed = urlparse(url)

    scheme = parsed.scheme.lower() or 'http'
    netloc = parsed.netloc.lower().removeprefix('www.')

    # Remove trailing slashes and index.html/index.php
    path = parsed.path.rstrip('/')
    for index_file in _INDEX_FILES:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    # Keep every query parameter except tracking ones.  Parameters without
    # a value (e.g. "?12345" or "?print") are kept too: they can identify
    # the page, and dropping them would merge distinct URLs.
    query_parts = []
    for param in parsed.query.split('&'):
        if not param:
            continue
        key = param.partition('=')[0]
        if not key.startswith(_TRACKING_KEY_PREFIXES):
            query_parts.append(param)

    # The fragment is intentionally discarded: it never changes the page.
    normalized = f"{scheme}://{netloc}{path}"
    if query_parts:
        normalized += "?" + "&".join(query_parts)

    return normalized


def get_domain(url):
    """Return the lowercase host of *url* without a leading ``www.``.

    Port and credentials are not part of the result, so
    ``https://example.com:443/`` and ``https://example.com/`` compare equal.
    Returns ``""`` when the URL has no host.
    """
    host = urlparse(url).hostname or ''
    return host.removeprefix('www.')


def _collect_entries(feed_entries):
    """First pass: build one record per feed entry and index them by URL.

    Returns:
        ``(all_entries, entry_urls)`` where *all_entries* is the list of
        records in feed order and *entry_urls* maps each record's normalized
        link to the record (a later entry wins on a collision).
    """
    all_entries = []
    entry_urls = {}

    for entry in feed_entries:
        link = entry.get('link', '')
        if not link:
            continue

        # Feed title is stored as the first category in the aggregated feed
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Prefer full content over the summary
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        normalized_link = normalize_url(link)
        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': get_domain(link),
            'feed_title': feed_title,
            'id': entry.get('id', link),
            # The entry link is the base for resolving relative hrefs
            'content_links': extract_links_from_html(content, base_url=link),
            'references': [],      # Filled in by _link_entries
            'referenced_by': [],   # Filled in by _link_entries
            'external_links': [],  # Links to content outside the feed
        }

        all_entries.append(entry_data)
        entry_urls[normalized_link] = entry_data

    return all_entries, entry_urls


def _link_entries(all_entries, entry_urls):
    """Second pass: fill references, referenced_by and external_links in place."""
    for entry in all_entries:
        reference_ids = set()   # ids already listed in entry['references']
        external_urls = set()   # urls already listed in entry['external_links']

        for original_link in entry['content_links']:
            normalized_link = normalize_url(original_link)

            # A link from an entry to itself is neither a reference nor external
            if normalized_link == entry['normalized_link']:
                continue

            referenced_entry = entry_urls.get(normalized_link)
            if referenced_entry is not None:
                # Link to another entry in the feed; record it only once
                if referenced_entry['id'] in reference_ids:
                    continue
                reference_ids.add(referenced_entry['id'])

                entry['references'].append({
                    'id': referenced_entry['id'],
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True  # Mark as a reference to a post in the feed
                })

                # Back-reference, guarded in case two entries share an id
                already_referenced = any(
                    ref['id'] == entry['id']
                    for ref in referenced_entry['referenced_by']
                )
                if not already_referenced:
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True  # Mark as a reference from a post in the feed
                    })
            elif get_domain(original_link) != entry['domain']:
                # Outside the feed; only links to *other* domains are kept
                if original_link not in external_urls:
                    external_urls.add(original_link)
                    entry['external_links'].append({
                        'url': original_link,
                        'normalized_url': normalized_link,
                        'in_feed': False  # Mark as external to the feed
                    })


def _print_statistics(all_entries):
    """Print summary counts for the analysed entries to stderr."""
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)


def analyze_feed(feed_path="eeg.xml", output_path="threads.json"):
    """Analyse the aggregated feed and write the thread data as JSON.

    Args:
        feed_path: Path (or URL) of the aggregated feed to parse.
        output_path: File that receives the JSON thread data.

    Progress and statistics are printed to stderr.  If the feed cannot be
    parsed or contains no entries, an error is printed and *output_path*
    is left untouched.
    """
    print(f"Parsing {feed_path}...", file=sys.stderr)
    feed_data = feedparser.parse(feed_path)

    # feedparser always returns an 'entries' list, even for a missing or
    # broken file, so test for emptiness rather than for the attribute.
    if not feed_data or not feed_data.get('entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    # Debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)
    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries, entry_urls = _collect_entries(feed_data.entries)
    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

    _link_entries(all_entries, entry_urls)

    # Public thread structure, keyed by entry id (internal fields omitted)
    thread_data = {
        entry['id']: {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links']
        }
        for entry in all_entries
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(thread_data, f, indent=2)

    print(f"Thread data successfully written to {output_path}", file=sys.stderr)

    _print_statistics(all_entries)


if __name__ == "__main__":
    analyze_feed()

+93 -2

threads.json

··· 29 29 } 30 30 ] 31 31 }, 32 + "https://www.jonmsterling.com/2025-W15/": { 33 + "id": "https://www.jonmsterling.com/2025-W15/", 34 + "title": "Weeknotes 2025-W15", 35 + "link": "https://www.jonmsterling.com/2025-W15/", 36 + "feed_title": "Jon Sterling \u203a Weeknotes", 37 + "references": [ 38 + { 39 + "id": "https://www.forester-notes.org/JVIT/", 40 + "link": "https://www.forester-notes.org/JVIT/", 41 + "title": "Towards Forester 5.0 II: a design for canonical URLs", 42 + "feed_title": "Forester Blog", 43 + "in_feed": true 44 + }, 45 + { 46 + "id": "https://patrick.sirref.org/weekly-2025-03-31/", 47 + "link": "https://patrick.sirref.org/weekly-2025-03-31/", 48 + "title": "Shelter, Hazel and More!", 49 + "feed_title": "Weeklies", 50 + "in_feed": true 51 + } 52 + ], 53 + "referenced_by": [], 54 + "external_links": [ 55 + { 56 + "url": "https://www.forester-notes.org/jms-011P/", 57 + "normalized_url": "https://forester-notes.org/jms-011P", 58 + "in_feed": false 59 + }, 60 + { 61 + "url": "https://git.sr.ht/~jonsterling/forester-base-theme/commit/a251f9cf19b0ff42f4553d315df5181b985c79cb", 62 + "normalized_url": "https://git.sr.ht/~jonsterling/forester-base-theme/commit/a251f9cf19b0ff42f4553d315df5181b985c79cb", 63 + "in_feed": false 64 + }, 65 + { 66 + "url": "https://topiary.tweag.io/", 67 + "normalized_url": "https://topiary.tweag.io", 68 + "in_feed": false 69 + }, 70 + { 71 + "url": "https://github.com/RedPRL/cooltt", 72 + "normalized_url": "https://github.com/RedPRL/cooltt", 73 + "in_feed": false 74 + }, 75 + { 76 + "url": "https://github.com/RedPRL/redtt", 77 + "normalized_url": "https://github.com/RedPRL/redtt", 78 + "in_feed": false 79 + }, 80 + { 81 + "url": "https://github.com/RedPRL/sml-redprl", 82 + "normalized_url": "https://github.com/RedPRL/sml-redprl", 83 + "in_feed": false 84 + }, 85 + { 86 + "url": "https://lawrencecpaulson.github.io/tag/locales", 87 + "normalized_url": "https://lawrencecpaulson.github.io/tag/locales", 88 + "in_feed": false 89 + }, 
90 + { 91 + "url": "https://www21.in.tum.de/~ballarin/publications/jar2019.pdf", 92 + "normalized_url": "https://www21.in.tum.de/~ballarin/publications/jar2019.pdf", 93 + "in_feed": false 94 + }, 95 + { 96 + "url": "https://github.com/agda/agda/issues/5837", 97 + "normalized_url": "https://github.com/agda/agda/issues/5837", 98 + "in_feed": false 99 + }, 100 + { 101 + "url": "https://www.abebooks.co.uk/9789812701428/Domain-theoretic-Foundations-Functional-Programming-Streicher-9812701427/plp", 102 + "normalized_url": "https://abebooks.co.uk/9789812701428/Domain-theoretic-Foundations-Functional-Programming-Streicher-9812701427/plp", 103 + "in_feed": false 104 + } 105 + ] 106 + }, 32 107 "https://mort.io/blog/coping-and-capping/": { 33 108 "id": "https://mort.io/blog/coping-and-capping/", 34 109 "title": "Coping and Capping", ··· 908 983 "link": "https://patrick.sirref.org/weekly-2025-03-31/", 909 984 "feed_title": "Weeklies", 910 985 "references": [], 911 - "referenced_by": [], 986 + "referenced_by": [ 987 + { 988 + "id": "https://www.jonmsterling.com/2025-W15/", 989 + "link": "https://www.jonmsterling.com/2025-W15/", 990 + "title": "Weeknotes 2025-W15", 991 + "feed_title": "Jon Sterling \u203a Weeknotes", 992 + "in_feed": true 993 + } 994 + ], 912 995 "external_links": [ 913 996 { 914 997 "url": "https://github.com/quantifyearth/shark", ··· 1325 1408 "in_feed": true 1326 1409 } 1327 1410 ], 1328 - "referenced_by": [], 1411 + "referenced_by": [ 1412 + { 1413 + "id": "https://www.jonmsterling.com/2025-W15/", 1414 + "link": "https://www.jonmsterling.com/2025-W15/", 1415 + "title": "Weeknotes 2025-W15", 1416 + "feed_title": "Jon Sterling \u203a Weeknotes", 1417 + "in_feed": true 1418 + } 1419 + ], 1329 1420 "external_links": [ 1330 1421 { 1331 1422 "url": "https://web.archive.org/",

Configure Feed

Configure Feed