feat: publication layer on atlas — turso metadata, zoom-scaled circles, lazy cover images

+313 -5

2 changed files

expand all

scripts

build-atlas

site

atlas.js

+195 -3

scripts/build-atlas

··· 1 1 #!/usr/bin/env -S uv run --script --quiet 2 2 # /// script 3 3 # requires-python = ">=3.12" 4 - # dependencies = ["httpx", "numpy", "scikit-learn", "umap-learn", "hdbscan", "pydantic-settings"] 4 + # dependencies = ["httpx", "numpy", "scikit-learn", "umap-learn", "hdbscan", "pydantic-settings", "anthropic"] 5 5 # /// 6 6 """ 7 7 Build atlas.json — 2D semantic map of the document index. ··· 41 41 42 42 turbopuffer_api_key: str 43 43 turbopuffer_namespace: str = "leaflet-search" 44 + anthropic_api_key: str = "" # optional: enables LLM-refined cluster labels 45 + turso_url: str = "" # optional: enables publication data (e.g. https://leaf-....turso.io) 46 + turso_token: str = "" 44 47 45 48 @classmethod 46 49 def settings_customise_sources(cls, settings_cls, **kwargs): ··· 89 92 return all_rows 90 93 91 94 95 + def turso_query_publications(settings: Settings) -> dict[str, dict]: 96 + """Fetch publication metadata from Turso. Returns basePath → {name, did, coverImage}.""" 97 + if not settings.turso_url or not settings.turso_token: 98 + return {} 99 + 100 + host = settings.turso_url.replace("https://", "").replace("http://", "").replace("libsql://", "") 101 + url = f"https://{host}/v3/pipeline" 102 + headers = {"Authorization": f"Bearer {settings.turso_token}"} 103 + 104 + body = { 105 + "requests": [ 106 + { 107 + "type": "execute", 108 + "stmt": { 109 + "sql": ( 110 + "SELECT p.name, p.base_path, p.did, " 111 + "(SELECT d.cover_image FROM documents d " 112 + "WHERE d.publication_uri = p.uri AND d.cover_image IS NOT NULL " 113 + "LIMIT 1) as cover_image " 114 + "FROM publications p" 115 + ) 116 + }, 117 + }, 118 + {"type": "close"}, 119 + ] 120 + } 121 + 122 + try: 123 + resp = httpx.post(url, headers=headers, json=body, timeout=30) 124 + resp.raise_for_status() 125 + except (httpx.ConnectError, httpx.HTTPStatusError) as e: 126 + log(f" warning: turso query failed: {e}") 127 + return {} 128 + results = resp.json().get("results", []) 129 + if not results or "response" not in results[0]: 130 + return {} 131 + 132 + rows = results[0]["response"]["result"]["rows"] 133 + cols = [c["name"] for c in results[0]["response"]["result"]["cols"]] 134 + lookup = {} 135 + for row in rows: 136 + values = {cols[i]: (row[i]["value"] if row[i]["type"] != "null" else None) for i in range(len(cols))} 137 + bp = values.get("base_path") 138 + if bp: 139 + lookup[bp] = { 140 + "name": values.get("name") or bp, 141 + "did": values.get("did") or "", 142 + "coverImage": values.get("cover_image") or "", 143 + } 144 + return lookup 145 + 146 + 92 147 def extract_terms(title: str) -> list[str]: 93 148 """Extract lowercase alphanumeric tokens from a title.""" 94 149 if not title: ··· 130 185 return labels 131 186 132 187 188 + def llm_refine_labels( 189 + tfidf_labels: dict[int, str], 190 + titles_per_cluster: dict[int, list[str]], 191 + cluster_counts: dict[int, int], 192 + api_key: str, 193 + tier: str = "coarse", 194 + batch_size: int = 15, 195 + ) -> dict[int, str]: 196 + """Refine c-TF-IDF labels with Claude Haiku for human-readable topic names.""" 197 + import asyncio 198 + from anthropic import AsyncAnthropic 199 + 200 + client = AsyncAnthropic(api_key=api_key) 201 + rng = np.random.default_rng(42) 202 + 203 + cluster_ids = sorted(tfidf_labels.keys()) 204 + batches = [cluster_ids[i : i + batch_size] for i in range(0, len(cluster_ids), batch_size)] 205 + 206 + async def label_batch(batch: list[int]) -> dict[int, str]: 207 + parts = [] 208 + for cid in batch: 209 + titles = titles_per_cluster.get(cid, []) 210 + sample = list(rng.choice(titles, size=min(12, len(titles)), replace=False)) if titles else [] 211 + parts.append( 212 + f"Cluster {cid} ({cluster_counts.get(cid, 0)} docs):\n" 213 + f" Keywords: {tfidf_labels[cid]}\n" 214 + f" Sample titles: {', '.join(repr(t) for t in sample)}" 215 + ) 216 + 217 + prompt = ( 218 + "You are labeling topic clusters for a document atlas — a 2D map where each cluster " 219 + "represents a group of semantically similar blog posts and articles.\n\n" 220 + "For each cluster below, write a short descriptive label (2-4 lowercase words) that " 221 + "captures the theme. The label should read like a topic name on a map.\n\n" 222 + "Good labels: \"ai language models\", \"bluesky development\", \"personal journals\", " 223 + "\"spanish poetry\", \"game reviews\", \"home cooking\"\n" 224 + "Bad labels: \"ai llms llm\", \"results april june\", \"39 let english\"\n\n" 225 + + "\n\n".join(parts) 226 + + "\n\nRespond with ONLY a JSON object mapping cluster ID (as string) to label. No other text." 227 + ) 228 + 229 + resp = await client.messages.create( 230 + model="claude-haiku-4-5-20251001", 231 + max_tokens=2048, 232 + messages=[{"role": "user", "content": prompt}], 233 + ) 234 + text = resp.content[0].text.strip() 235 + # extract JSON from response (handle markdown code fences) 236 + if text.startswith("```"): 237 + text = text.split("\n", 1)[1].rsplit("```", 1)[0].strip() 238 + try: 239 + parsed = json.loads(text) 240 + return {int(k): str(v) for k, v in parsed.items()} 241 + except (json.JSONDecodeError, ValueError): 242 + log(f" warning: failed to parse LLM response for batch, using TF-IDF fallback") 243 + return {} 244 + 245 + async def run_all(): 246 + results = await asyncio.gather(*[label_batch(b) for b in batches]) 247 + merged = {} 248 + for r in results: 249 + merged.update(r) 250 + return merged 251 + 252 + log(f" refining {tier} labels with LLM ({len(batches)} batches, {len(cluster_ids)} clusters)...") 253 + refined = asyncio.run(run_all()) 254 + 255 + # merge: use LLM labels where available, fall back to c-TF-IDF 256 + final = {} 257 + for cid in cluster_ids: 258 + final[cid] = refined.get(cid, tfidf_labels[cid]) 259 + log(f" {tier}: {len(refined)}/{len(cluster_ids)} labels refined by LLM") 260 + return final 261 + 262 + 133 263 def assign_outliers(coords_2d: np.ndarray, labels: np.ndarray, centroids: dict[int, np.ndarray]) -> np.ndarray: 134 264 """Assign outlier points (label == -1) to nearest cluster centroid.""" 135 265 result = labels.copy() ··· 267 397 mask = labels_fine == cid 268 398 fine_centroids[cid] = X_2d[mask].mean(axis=0) 269 399 270 - # --- step 4: labels via c-TF-IDF --- 400 + # --- step 4: labels via c-TF-IDF + optional LLM refinement --- 271 401 log("computing cluster labels (c-TF-IDF on titles)...") 272 402 273 403 coarse_titles: dict[int, list[str]] = {} ··· 281 411 coarse_labels = cluster_labels(coarse_titles) 282 412 fine_labels = cluster_labels(fine_titles) 283 413 414 + if settings.anthropic_api_key: 415 + log("refining labels with LLM...") 416 + coarse_counts = {int(cid): int((labels_coarse == cid).sum()) for cid in set(labels_coarse)} 417 + fine_counts = {int(cid): int((labels_fine == cid).sum()) for cid in set(labels_fine)} 418 + try: 419 + coarse_labels = llm_refine_labels( 420 + coarse_labels, coarse_titles, coarse_counts, 421 + settings.anthropic_api_key, tier="coarse", 422 + ) 423 + fine_labels = llm_refine_labels( 424 + fine_labels, fine_titles, fine_counts, 425 + settings.anthropic_api_key, tier="fine", 426 + ) 427 + except Exception as e: 428 + log(f" LLM labeling failed, using c-TF-IDF fallback: {e}") 429 + else: 430 + log(" (no ANTHROPIC_API_KEY — using c-TF-IDF labels only)") 431 + 284 432 # map fine clusters to their parent coarse cluster (majority vote) 285 433 fine_to_coarse = {} 286 434 for fine_id in set(labels_fine): ··· 292 440 log(f" coarse: {len(coarse_labels)} labels") 293 441 log(f" fine: {len(fine_labels)} labels") 294 442 295 - # --- step 5: build output --- 443 + # --- step 5: publication centroids --- 444 + log("computing publication centroids...") 445 + pub_lookup = turso_query_publications(settings) 446 + if pub_lookup: 447 + log(f" fetched {len(pub_lookup)} publications from turso") 448 + else: 449 + log(" (no TURSO_URL/TURSO_TOKEN — using basePath grouping only)") 450 + 451 + # group documents by basePath → list of 2D positions 452 + bp_groups: dict[str, list[int]] = {} 453 + for i, meta in enumerate(metadata): 454 + bp = meta.get("basePath", "") 455 + if bp: 456 + bp_groups.setdefault(bp, []).append(i) 457 + 458 + publications = [] 459 + for bp, indices in bp_groups.items(): 460 + if len(indices) < 2: 461 + continue 462 + coords = X_2d[indices] 463 + centroid = coords.mean(axis=0) 464 + info = pub_lookup.get(bp, {}) 465 + pub = { 466 + "name": info.get("name") or bp, 467 + "basePath": bp, 468 + "did": info.get("did", ""), 469 + "cx": round(float(centroid[0]), 4), 470 + "cy": round(float(centroid[1]), 4), 471 + "count": len(indices), 472 + "coverImage": info.get("coverImage", ""), 473 + } 474 + # determine dominant platform for this publication 475 + plat_counts: dict[str, int] = {} 476 + for idx in indices: 477 + p = metadata[idx].get("platform", "other") 478 + plat_counts[p] = plat_counts.get(p, 0) + 1 479 + pub["platform"] = max(plat_counts, key=plat_counts.get) # type: ignore 480 + publications.append(pub) 481 + 482 + publications.sort(key=lambda p: p["count"], reverse=True) 483 + log(f" {len(publications)} publications with 2+ documents") 484 + 485 + # --- step 6: build output --- 296 486 log("building output...") 297 487 298 488 points = [] ··· 338 528 "coarse": coarse_clusters, 339 529 "fine": fine_clusters, 340 530 }, 531 + "publications": publications, 341 532 "meta": { 342 533 "generatedAt": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), 343 534 "nDocuments": len(points), 535 + "nPublications": len(publications), 344 536 }, 345 537 } 346 538

+118 -2

site/atlas.js

··· 86 86 var uriToIndex = null; // Map<uri, index> for search matching 87 87 var clusterFineArr = null; // Uint8Array of fine cluster IDs per point 88 88 89 + // --- publication state --- 90 + var pubData = null; // array from atlas.json 91 + var pubImages = {}; // basePath → HTMLImageElement (loaded) 92 + var pubFailed = {}; // basePath → true (failed to load) 93 + var pubLoading = {}; // basePath → true (currently loading) 94 + var PUB_MAX_CONCURRENT = 6; 95 + var pubLoadCount = 0; 96 + 97 + function pubImageUrl(did, cid) { 98 + if (!did || !cid) return null; 99 + return 'https://cdn.bsky.app/img/feed_thumbnail/plain/' + did + '/' + cid + '@jpeg'; 100 + } 101 + 102 + function loadPubImage(pub) { 103 + var key = pub.basePath; 104 + if (pubImages[key] || pubFailed[key] || pubLoading[key]) return; 105 + if (pubLoadCount >= PUB_MAX_CONCURRENT) return; 106 + var url = pubImageUrl(pub.did, pub.coverImage); 107 + if (!url) { pubFailed[key] = true; return; } 108 + pubLoading[key] = true; 109 + pubLoadCount++; 110 + var img = new Image(); 111 + img.crossOrigin = 'anonymous'; 112 + img.onload = function() { 113 + pubImages[key] = img; 114 + delete pubLoading[key]; 115 + pubLoadCount--; 116 + markDirty(); 117 + }; 118 + img.onerror = function() { 119 + pubFailed[key] = true; 120 + delete pubLoading[key]; 121 + pubLoadCount--; 122 + }; 123 + img.src = url; 124 + } 125 + 89 126 // --- search state --- 90 127 var searchMatches = null; // Set of point indices matching current search 91 128 var searchCenter = null; // {x, y} weighted centroid of matches ··· 440 477 ctx.stroke(); 441 478 } 442 479 } 480 + } 481 + 482 + // --- publication circles --- 483 + // radius = sqrt(count) * zoom * 0.5, no floor — small pubs vanish at low zoom 484 + // at zoom 1: sqrt(50)≈7 → 3.5px (visible), sqrt(10)≈3 → 1.5px (culled) 485 + // at zoom 3: sqrt(10)≈3 → 4.5px (appears), sqrt(4)=2 → 3px (culled) 486 + // at zoom 6: sqrt(4)=2 → 6px (appears) 487 + // result: ~25 visible at overview, hundreds at deep zoom, all smooth 488 + if (pubData && pubData.length > 0) { 489 + var pubLabelZoom = 3; 490 + var pubRendered = 0; 491 + for (var pi2 = 0; pi2 < pubData.length; pi2++) { 492 + var pub = pubData[pi2]; 493 + var pr = Math.min(40, Math.sqrt(pub.count) * zoom * 0.5); 494 + if (pr < 4) continue; // natural culling — small pubs disappear 495 + var psx = cx + pub.cx * scale, psy = cy + pub.cy * scale; 496 + // cull off-screen (with padding for labels) 497 + if (psx < -60 || psx > W + 60 || psy < -60 || psy > H + 60) continue; 498 + pubRendered++; 499 + var pPlatform = pub.platform || 'other'; 500 + var pColors = frameColors[pPlatform] || frameColors.other; 501 + 502 + // lazy-load image for visible, large-enough publications 503 + if (pr >= 12) loadPubImage(pub); 504 + 505 + var img = pubImages[pub.basePath]; 506 + if (img) { 507 + // clipped circle with cover image 508 + ctx.save(); 509 + ctx.globalAlpha = 0.9; 510 + ctx.beginPath(); 511 + ctx.arc(psx, psy, pr, 0, Math.PI * 2); 512 + ctx.clip(); 513 + ctx.drawImage(img, psx - pr, psy - pr, pr * 2, pr * 2); 514 + ctx.restore(); 515 + } else { 516 + // fallback: filled circle with first letter 517 + ctx.globalAlpha = 0.7; 518 + ctx.beginPath(); 519 + ctx.arc(psx, psy, pr, 0, Math.PI * 2); 520 + ctx.fillStyle = pColors.edge; 521 + ctx.fill(); 522 + // letter only when circle is large enough to read 523 + if (pr >= 10) { 524 + var letter = (pub.name || '?').charAt(0).toUpperCase(); 525 + var letterSize = Math.max(8, pr * 0.9); 526 + ctx.font = 'bold ' + Math.round(letterSize) + 'px monospace'; 527 + ctx.textAlign = 'center'; 528 + ctx.textBaseline = 'middle'; 529 + ctx.fillStyle = pColors.core; 530 + ctx.globalAlpha = 0.9; 531 + ctx.fillText(letter, psx, psy); 532 + } 533 + } 534 + 535 + // border ring 536 + ctx.globalAlpha = 0.6; 537 + ctx.beginPath(); 538 + ctx.arc(psx, psy, pr, 0, Math.PI * 2); 539 + ctx.strokeStyle = pColors.mid; 540 + ctx.lineWidth = 1.5; 541 + ctx.stroke(); 542 + 543 + // name label at higher zoom 544 + if (zoom >= pubLabelZoom && pr >= 10) { 545 + var labelSize = Math.max(8, Math.min(12, pr * 0.5)); 546 + ctx.font = Math.round(labelSize) + 'px monospace'; 547 + ctx.textAlign = 'center'; 548 + ctx.textBaseline = 'top'; 549 + ctx.globalAlpha = Math.min(0.8, fadeIn(zoom, pubLabelZoom, 1.0)); 550 + ctx.fillStyle = dark ? 'rgba(255,255,255,0.85)' : 'rgba(0,0,0,0.75)'; 551 + var pubLabel = pub.name.length > 20 ? pub.name.substring(0, 18) + '\u2026' : pub.name; 552 + drawLabel(pubLabel, psx, psy + pr + 4, dark); 553 + } 554 + } 555 + ctx.globalAlpha = 1; 443 556 } 444 557 445 558 // --- points: sprite-stamped --- ··· 1008 1121 var cl = d.clusters.fine[c]; 1009 1122 cl.radius = cl._distN > 0 ? (cl._distSum / cl._distN) * 2 : 0.02; 1010 1123 } 1124 + // load publication data 1125 + pubData = d.publications || []; 1011 1126 buildSpatialIndex(); 1012 1127 renderLegend(); 1013 - document.getElementById('stats').textContent = 1014 - n.toLocaleString() + ' documents \u00B7 ' + 1128 + var statsText = n.toLocaleString() + ' documents \u00B7 ' + 1015 1129 d.clusters.coarse.length + ' regions \u00B7 ' + 1016 1130 d.clusters.fine.length + ' clusters'; 1131 + if (pubData.length > 0) statsText += ' \u00B7 ' + pubData.length + ' publications'; 1132 + document.getElementById('stats').textContent = statsText; 1017 1133 document.getElementById('loading').classList.add('hidden'); 1018 1134 markDirty(); 1019 1135 // jump to specific document by URI (from "view on atlas" links)

Configure Feed

Configure Feed