···55# "feedgenerator",
66# "requests",
77# "beautifulsoup4",
88+# "urllib3",
89# ]
910# ///
1011# Do not delete the above as it's needed for `uv run`
···2122import re
2223from html import unescape
2324from bs4 import BeautifulSoup
2525+from urllib.parse import urlparse, urljoin
24262527def load_feed_urls(file_path):
2628 with open(file_path, 'r') as f:
···147149 # Get link
148150 link = entry.get('link', '')
149151150150- # Get description/content
152152+ # Get full content from the feed entry
151153 if hasattr(entry, 'content') and entry.content:
152154 content = entry.content[0].value
153155 else:
154156 content = entry.get('summary', '')
155155-157157+156158 # Create HTML preview that will be used as the content
157159 preview = create_html_preview(content)
158160···162164 all_entries.append({
163165 'title': title,
164166 'link': link,
165165- 'content': content,
167167+ 'content': content, # Use the feed content directly
166168 'preview': preview,
167169 'author': author_name,
168170 'pub_date': pub_date,
···207209208210 return feed
209211212212+# Functions from make_threads.py
def extract_links_from_html(html_content, base_url=None):
    """Return the hyperlink targets found in an HTML fragment.

    Args:
        html_content: HTML markup to scan. A falsy value (``None`` or an
            empty string) yields an empty list without invoking the parser.
        base_url: Optional URL used to resolve ``href`` values that do not
            start with ``http://`` or ``https://``. When omitted, such
            links are returned as written.

    Returns:
        A list of ``href`` strings in document order; duplicates are kept.
        Empty hrefs, in-page anchors (``#...``) and ``javascript:`` /
        ``mailto:`` links are left out.
    """
    if not html_content:
        return []

    skipped_prefixes = ('#', 'javascript:', 'mailto:')
    absolute_prefixes = ('http://', 'https://')

    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # URI schemes are case-insensitive, so test a lowered copy while
        # keeping the author's original spelling in the result.
        lowered = href.lower()
        if not href or lowered.startswith(skipped_prefixes):
            continue

        # Resolve anything that is not already an absolute web URL.
        if base_url and not lowered.startswith(absolute_prefixes):
            href = urljoin(base_url, href)

        links.append(href)

    return links
def normalize_url(url):
    """Reduce a URL to a canonical form so equivalent links compare equal.

    The following differences are erased:

    * scheme and host letter case, and a leading ``www.`` on the host;
    * a missing scheme (``http`` is assumed);
    * trailing slashes and a trailing ``index.html`` / ``index.php`` /
      ``index.htm`` path segment;
    * query parameters whose key starts with ``utm_``, ``ref`` or
      ``source``;
    * the fragment, which is always discarded.

    All other query parameters are kept verbatim and in their original
    order. This includes parameters without a value (``?page2``), because
    dropping them would make distinct URLs collapse onto the same key.

    Args:
        url: The URL to normalize. A falsy value returns ``""``.

    Returns:
        The normalized URL string.
    """
    if not url:
        return ""

    tracking_prefixes = ('utm_', 'ref', 'source')
    index_files = ('/index.html', '/index.php', '/index.htm')

    parsed = urlparse(url)

    scheme = parsed.scheme.lower() or 'http'
    netloc = parsed.netloc.lower().removeprefix('www.')

    path = parsed.path.rstrip('/')
    for index_file in index_files:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    kept_params = []
    for param in parsed.query.split('&'):
        if not param:
            # Empty segment from "a=1&&b=2" or a trailing "&".
            continue
        key = param.split('=', 1)[0]
        if key.startswith(tracking_prefixes):
            continue
        kept_params.append(param)

    normalized = f"{scheme}://{netloc}{path}"
    if kept_params:
        normalized += "?" + "&".join(kept_params)

    return normalized
def get_domain(url):
    """Return the lower-cased host name of a URL without a ``www.`` prefix.

    Only the host is returned: any ``user:password@`` prefix and ``:port``
    suffix are ignored, so the same site compares equal regardless of how
    the port is spelled.

    Args:
        url: The URL to inspect. A falsy value returns ``""``.

    Returns:
        The host name, or ``""`` when the URL has no network location
        (for example a relative path).
    """
    if not url:
        return ""

    # ``hostname`` is already lower-cased and is None when there is no host.
    host = urlparse(url).hostname or ""
    return host.removeprefix('www.')
def generate_threads(entries):
    """Build the cross-reference ("thread") map for a list of feed entries.

    Each entry's HTML content is scanned for links. A link whose normalized
    form equals the normalized link of another entry becomes an in-feed
    reference, recorded on both sides. Any other link that points to a
    different domain than the entry itself is recorded as an external link.
    Links back to the entry itself and links within the entry's own domain
    are ignored.

    Args:
        entries: Iterable of dicts. ``link`` and ``content`` are read from
            each one, plus ``title``, ``feed_title`` and ``id``. Entries
            without a link are skipped. Missing ``title`` / ``feed_title``
            / ``id`` fall back to ``'No title'`` / ``'Unknown'`` / the
            entry link. When several entries share a normalized link, the
            last one wins.

    Returns:
        A dict keyed by entry id. Each value holds ``id``, ``title``,
        ``link``, ``feed_title``, ``references``, ``referenced_by`` and
        ``external_links``.

    Progress and summary statistics are printed to stderr.
    """
    print(f"Generating thread data from {len(entries)} entries...", file=sys.stderr)

    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: index every entry by its normalized link.
    for entry in entries:
        link = entry.get('link')
        if not link:
            continue

        normalized_link = normalize_url(link)
        # Resolve relative hrefs against the entry's own link.
        content_links = extract_links_from_html(entry.get('content') or '', base_url=link)

        entry_urls[normalized_link] = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': get_domain(link),
            'feed_title': entry.get('feed_title', 'Unknown'),
            'id': entry.get('id', link),
            'content_links': content_links,
            'references': [],       # Filled in the second pass
            'referenced_by': [],    # Filled in the second pass
            'external_links': [],   # Links to content outside the feed
            # Internal bookkeeping only; never copied into the result.
            'referenced_by_ids': set(),
        }

    print("Extracted links from all entries", file=sys.stderr)

    # Second pass: classify every content link of every entry.
    for entry_data in entry_urls.values():
        seen_reference_ids = set()
        seen_external_urls = set()

        for original_link in entry_data['content_links']:
            normalized_link = normalize_url(original_link)

            # A link back to the entry itself is neither a reference nor external.
            if normalized_link == entry_data['normalized_link']:
                continue

            referenced_entry = entry_urls.get(normalized_link)
            if referenced_entry is not None:
                referenced_id = referenced_entry['id']
                if referenced_id in seen_reference_ids:
                    continue
                seen_reference_ids.add(referenced_id)

                entry_data['references'].append({
                    'id': referenced_id,
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True,  # Reference to a post in the feed
                })

                # Record the back-reference once per referencing entry id.
                if entry_data['id'] not in referenced_entry['referenced_by_ids']:
                    referenced_entry['referenced_by_ids'].add(entry_data['id'])
                    referenced_entry['referenced_by'].append({
                        'id': entry_data['id'],
                        'link': entry_data['link'],
                        'title': entry_data['title'],
                        'feed_title': entry_data['feed_title'],
                        'in_feed': True,  # Reference from a post in the feed
                    })
                continue

            # Not an entry in the feed: keep it only if it leaves the entry's domain.
            if get_domain(original_link) == entry_data['domain']:
                continue
            if original_link in seen_external_urls:
                continue
            seen_external_urls.add(original_link)
            entry_data['external_links'].append({
                'url': original_link,
                'normalized_url': normalized_link,
                'in_feed': False,  # External to the feed
            })

    # Project the public fields, keyed by entry id.
    thread_data = {
        entry_data['id']: {
            'id': entry_data['id'],
            'title': entry_data['title'],
            'link': entry_data['link'],
            'feed_title': entry_data['feed_title'],
            'references': entry_data['references'],
            'referenced_by': entry_data['referenced_by'],
            'external_links': entry_data['external_links'],
        }
        for entry_data in entry_urls.values()
    }

    # Summary statistics
    indexed = list(entry_urls.values())
    entries_with_references = sum(1 for data in indexed if data['references'])
    entries_with_referenced_by = sum(1 for data in indexed if data['referenced_by'])
    entries_with_external_links = sum(1 for data in indexed if data['external_links'])
    total_internal_references = sum(len(data['references']) for data in indexed)
    total_external_links = sum(len(data['external_links']) for data in indexed)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(entry_urls)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)

    return thread_data
420420+210421def main():
211422 # Load feed URLs
212423 feed_urls = load_feed_urls('feed.json')
···235446 feed.write(f, 'utf-8')
236447237448 print(f"Feed successfully written to eeg.xml", file=sys.stderr)
449449+450450+ # Generate thread data
451451+ thread_data = generate_threads(entries)
452452+453453+ # Write the thread data to a JSON file
454454+ with open('threads.json', 'w') as f:
455455+ json.dump(thread_data, f, indent=2)
456456+457457+ print(f"Thread data successfully written to threads.json", file=sys.stderr)
238458239459if __name__ == "__main__":
240460 main()
-256
make_threads.py
···11-# /// script
22-# requires-python = ">=3.11"
33-# dependencies = [
44-# "feedparser",
55-# "beautifulsoup4",
66-# "urllib3",
77-# ]
88-# ///
99-# Do not delete the above as it's needed for `uv run`
1010-#!/usr/bin/env python3
1111-1212-import json
1313-import feedparser
1414-import sys
1515-import os
1616-from bs4 import BeautifulSoup
1717-import re
1818-from urllib.parse import urlparse, urljoin
def extract_links_from_html(html_content, base_url=None):
    """Return the hyperlink targets found in an HTML fragment.

    Args:
        html_content: HTML markup to scan. A falsy value (``None`` or an
            empty string) yields an empty list without invoking the parser.
        base_url: Optional URL used to resolve ``href`` values that do not
            start with ``http://`` or ``https://``. When omitted, such
            links are returned as written.

    Returns:
        A list of ``href`` strings in document order; duplicates are kept.
        Empty hrefs, in-page anchors (``#...``) and ``javascript:`` /
        ``mailto:`` links are left out.
    """
    if not html_content:
        return []

    skipped_prefixes = ('#', 'javascript:', 'mailto:')
    absolute_prefixes = ('http://', 'https://')

    soup = BeautifulSoup(html_content, 'html.parser')
    links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href'].strip()

        # URI schemes are case-insensitive, so test a lowered copy while
        # keeping the author's original spelling in the result.
        lowered = href.lower()
        if not href or lowered.startswith(skipped_prefixes):
            continue

        # Resolve anything that is not already an absolute web URL.
        if base_url and not lowered.startswith(absolute_prefixes):
            href = urljoin(base_url, href)

        links.append(href)

    return links
def normalize_url(url):
    """Reduce a URL to a canonical form so equivalent links compare equal.

    The following differences are erased:

    * scheme and host letter case, and a leading ``www.`` on the host;
    * a missing scheme (``http`` is assumed);
    * trailing slashes and a trailing ``index.html`` / ``index.php`` /
      ``index.htm`` path segment;
    * query parameters whose key starts with ``utm_``, ``ref`` or
      ``source``;
    * the fragment, which is always discarded.

    All other query parameters are kept verbatim and in their original
    order. This includes parameters without a value (``?page2``), because
    dropping them would make distinct URLs collapse onto the same key.

    Args:
        url: The URL to normalize. A falsy value returns ``""``.

    Returns:
        The normalized URL string.
    """
    if not url:
        return ""

    tracking_prefixes = ('utm_', 'ref', 'source')
    index_files = ('/index.html', '/index.php', '/index.htm')

    parsed = urlparse(url)

    scheme = parsed.scheme.lower() or 'http'
    netloc = parsed.netloc.lower().removeprefix('www.')

    path = parsed.path.rstrip('/')
    for index_file in index_files:
        if path.endswith(index_file):
            path = path[:-len(index_file)]

    kept_params = []
    for param in parsed.query.split('&'):
        if not param:
            # Empty segment from "a=1&&b=2" or a trailing "&".
            continue
        key = param.split('=', 1)[0]
        if key.startswith(tracking_prefixes):
            continue
        kept_params.append(param)

    normalized = f"{scheme}://{netloc}{path}"
    if kept_params:
        normalized += "?" + "&".join(kept_params)

    return normalized
def get_domain(url):
    """Return the lower-cased host name of a URL without a ``www.`` prefix.

    Only the host is returned: any ``user:password@`` prefix and ``:port``
    suffix are ignored, so the same site compares equal regardless of how
    the port is spelled.

    Args:
        url: The URL to inspect. A falsy value returns ``""``.

    Returns:
        The host name, or ``""`` when the URL has no network location
        (for example a relative path).
    """
    if not url:
        return ""

    # ``hostname`` is already lower-cased and is None when there is no host.
    host = urlparse(url).hostname or ""
    return host.removeprefix('www.')
def analyze_feed():
    """Build ``threads.json`` from the aggregated feed stored in ``eeg.xml``.

    The feed is parsed with feedparser, every entry's content is scanned
    for links, and each link is classified:

    * a link whose normalized form equals another entry's normalized link
      is recorded as an in-feed reference on both entries;
    * any other link pointing to a different domain than the entry is
      recorded as an external link;
    * links back to the entry itself or within its own domain are ignored.

    The result, keyed by entry id, is written to ``threads.json``. Progress
    and summary statistics go to stderr. Returns ``None``.
    """
    print("Parsing eeg.xml...", file=sys.stderr)
    feed_data = feedparser.parse("eeg.xml")

    # Validate the parse result before reading feed_data.feed below.
    if not feed_data or not hasattr(feed_data, 'entries'):
        print("Error: Could not parse feed or no entries found", file=sys.stderr)
        return

    # Debug info about the feed
    print(f"Feed title: {feed_data.feed.get('title', 'Unknown')}", file=sys.stderr)
    print(f"Feed version: {feed_data.get('version', 'Unknown')}", file=sys.stderr)
    print(f"Found {len(feed_data.entries)} entries in the aggregated feed", file=sys.stderr)

    all_entries = []
    entry_urls = {}  # Maps normalized URLs to entry data

    # First pass: collect all entries and index them by normalized link.
    for entry in feed_data.entries:
        link = entry.get('link', '')
        if not link:
            continue

        normalized_link = normalize_url(link)

        # The aggregated feed stores the source feed title as the first category.
        feed_title = "Unknown"
        if hasattr(entry, 'tags') and entry.tags:
            feed_title = entry.tags[0].term

        # Prefer full content; fall back to the summary.
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].value
        else:
            content = entry.get('summary', '')

        entry_data = {
            'title': entry.get('title', 'No title'),
            'link': link,
            'normalized_link': normalized_link,
            'domain': get_domain(link),
            'feed_title': feed_title,
            'id': entry.get('id', link),
            # Resolve relative hrefs against the entry's own link.
            'content_links': extract_links_from_html(content, base_url=link),
            'references': [],       # Filled in the second pass
            'referenced_by': [],    # Filled in the second pass
            'external_links': [],   # Links to content outside the feed
            # Internal bookkeeping only; never copied into the output.
            'referenced_by_ids': set(),
        }

        all_entries.append(entry_data)
        # When two entries share a normalized link, the later one wins here.
        entry_urls[normalized_link] = entry_data

    print(f"Total entries processed: {len(all_entries)}", file=sys.stderr)

    # Second pass: classify every content link of every entry.
    for entry in all_entries:
        seen_reference_ids = set()
        seen_external_urls = set()

        for original_link in entry['content_links']:
            normalized_link = normalize_url(original_link)

            # A link back to the entry itself is neither a reference nor external.
            if normalized_link == entry['normalized_link']:
                continue

            referenced_entry = entry_urls.get(normalized_link)
            if referenced_entry is not None:
                referenced_id = referenced_entry['id']
                if referenced_id in seen_reference_ids:
                    continue
                seen_reference_ids.add(referenced_id)

                entry['references'].append({
                    'id': referenced_id,
                    'link': referenced_entry['link'],
                    'title': referenced_entry['title'],
                    'feed_title': referenced_entry['feed_title'],
                    'in_feed': True,  # Reference to a post in the feed
                })

                # Record the back-reference once per referencing entry id.
                if entry['id'] not in referenced_entry['referenced_by_ids']:
                    referenced_entry['referenced_by_ids'].add(entry['id'])
                    referenced_entry['referenced_by'].append({
                        'id': entry['id'],
                        'link': entry['link'],
                        'title': entry['title'],
                        'feed_title': entry['feed_title'],
                        'in_feed': True,  # Reference from a post in the feed
                    })
                continue

            # Not an entry in the feed: keep it only if it leaves the entry's domain.
            if get_domain(original_link) == entry['domain']:
                continue
            if original_link in seen_external_urls:
                continue
            seen_external_urls.add(original_link)
            entry['external_links'].append({
                'url': original_link,
                'normalized_url': normalized_link,
                'in_feed': False,  # External to the feed
            })

    # Project the public fields, keyed by entry id.
    thread_data = {}
    for entry in all_entries:
        thread_data[entry['id']] = {
            'id': entry['id'],
            'title': entry['title'],
            'link': entry['link'],
            'feed_title': entry['feed_title'],
            'references': entry['references'],
            'referenced_by': entry['referenced_by'],
            'external_links': entry['external_links'],
        }

    with open('threads.json', 'w') as f:
        json.dump(thread_data, f, indent=2)

    print("Thread data successfully written to threads.json", file=sys.stderr)

    # Summary statistics
    entries_with_references = sum(1 for entry in all_entries if entry['references'])
    entries_with_referenced_by = sum(1 for entry in all_entries if entry['referenced_by'])
    entries_with_external_links = sum(1 for entry in all_entries if entry['external_links'])
    total_internal_references = sum(len(entry['references']) for entry in all_entries)
    total_external_links = sum(len(entry['external_links']) for entry in all_entries)

    print("\nThread Analysis:", file=sys.stderr)
    print(f"Total entries: {len(all_entries)}", file=sys.stderr)
    print(f"Entries that reference other entries in the feed: {entries_with_references}", file=sys.stderr)
    print(f"Entries referenced by other entries in the feed: {entries_with_referenced_by}", file=sys.stderr)
    print(f"Entries with external links: {entries_with_external_links}", file=sys.stderr)
    print(f"Total internal references: {total_internal_references}", file=sys.stderr)
    print(f"Total external links: {total_external_links}", file=sys.stderr)
254254-255255-if __name__ == "__main__":
256256- analyze_feed()