search for standard sites pub-search.waow.tech
search zig blog atproto
11
fork

Configure Feed

Select the types of activity you want to include in your feed.

fix: sync MCP stats model and scripts with voyage-4-lite

- expand Stats type with embeddings, searches, errors, started_at,
cache_hits, cache_misses, and per-endpoint timing metrics
- update backfill-embeddings to use voyage-4-lite with output_dimension
- update rebuild-documents-table to use F32_BLOB(1024)
- add regression tests for full Stats model

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

zzstoatzz cd6fe8b3 5651c1c9

+224 -7
+1 -1
mcp/README.md
··· 33 33 | `find_similar` | find semantically similar documents | 34 34 | `get_tags` | list all tags with document counts | 35 35 | `get_popular` | see popular search queries | 36 - | `get_stats` | index statistics (document/publication counts) | 36 + | `get_stats` | index statistics and performance metrics | 37 37 38 38 ## workflow 39 39
+19 -1
mcp/src/pub_search/_types.py
··· 53 53 count: int 54 54 55 55 56 + class EndpointTiming(BaseModel): 57 + """Timing stats for a single endpoint.""" 58 + 59 + count: int = 0 60 + avg_ms: float = 0.0 61 + p50_ms: float = 0.0 62 + p95_ms: float = 0.0 63 + p99_ms: float = 0.0 64 + max_ms: float = 0.0 65 + 66 + 56 67 class Stats(BaseModel): 57 - """Leaflet index statistics.""" 68 + """Index statistics.""" 58 69 59 70 documents: int 60 71 publications: int 72 + embeddings: int = 0 73 + searches: int = 0 74 + errors: int = 0 75 + started_at: int = 0 76 + cache_hits: int = 0 77 + cache_misses: int = 0 78 + timing: dict[str, EndpointTiming] = {} 61 79 62 80 63 81 class Document(BaseModel):
+26 -3
mcp/tests/test_mcp.py
··· 6 6 from fastmcp.client import Client 7 7 from fastmcp.client.transports import FastMCPTransport 8 8 9 - from pub_search._types import Document, PopularSearch, SearchResult, Stats, Tag 9 + from pub_search._types import Document, EndpointTiming, PopularSearch, SearchResult, Stats, Tag 10 10 from pub_search.server import mcp 11 11 12 12 ··· 70 70 assert p.query == "rust async" 71 71 assert p.count == 100 72 72 73 - def test_stats(self): 74 - """Stats can be constructed.""" 73 + def test_stats_minimal(self): 74 + """Stats can be constructed with just documents/publications.""" 75 75 s = Stats(documents=1000, publications=50) 76 76 assert s.documents == 1000 77 77 assert s.publications == 50 78 + assert s.embeddings == 0 79 + assert s.timing == {} 80 + 81 + def test_stats_full(self): 82 + """Stats can be constructed with all fields from API.""" 83 + s = Stats( 84 + documents=6527, 85 + publications=2335, 86 + embeddings=6527, 87 + searches=5321, 88 + errors=0, 89 + started_at=1767333441, 90 + cache_hits=978, 91 + cache_misses=627, 92 + timing={ 93 + "search_keyword": EndpointTiming( 94 + count=320, avg_ms=140.1, p50_ms=7.7, p95_ms=616.2, p99_ms=1090.1, max_ms=7294.9 95 + ), 96 + }, 97 + ) 98 + assert s.embeddings == 6527 99 + assert s.cache_hits == 978 100 + assert s.timing["search_keyword"].p50_ms == 7.7 78 101 79 102 def test_document(self): 80 103 """Document can be constructed with full content."""
+3 -2
scripts/backfill-embeddings
··· 128 128 }, 129 129 json={ 130 130 "input": texts, 131 - "model": "voyage-3-lite", 131 + "model": "voyage-4-lite", 132 + "output_dimension": 1024, 132 133 "input_type": "document", 133 134 }, 134 135 timeout=60, ··· 158 159 except Exception as e: 159 160 if "no such column" in str(e).lower(): 160 161 print("adding embedding column...") 161 - turso_exec(settings, "ALTER TABLE documents ADD COLUMN embedding F32_BLOB(512)") 162 + turso_exec(settings, "ALTER TABLE documents ADD COLUMN embedding F32_BLOB(1024)") 162 163 print("done") 163 164 else: 164 165 raise
+175
scripts/rebuild-documents-table
#!/usr/bin/env -S uv run --script --quiet
# /// script
# requires-python = ">=3.12"
# dependencies = ["httpx", "pydantic-settings"]
# ///
"""
Recreate the documents table with clean schema (F32_BLOB(1024) for voyage-4-lite).

This script:
1. Drops corrupted DiskANN shadow tables
2. Creates documents_new with clean schema (F32_BLOB(1024))
3. Copies all data (excluding embeddings — those will be re-generated by backfill-embeddings)
4. Verifies the copy, then drops old table and renames new one
5. Recreates FTS and indexes (no DiskANN — we use brute-force for /similar)

Usage:
    ./scripts/rebuild-documents-table
"""

import os
import sys
import time

import httpx
from pydantic_settings import BaseSettings, SettingsConfigDict

# Default HTTP timeout (seconds) for pipeline requests; the bulk copy and the
# FTS rebuild each run as a single request, so this is deliberately generous.
TIMEOUT = 300


class Settings(BaseSettings):
    """Connection settings, read from the environment or an env file.

    The env file path comes from the ENV_FILE environment variable and
    falls back to ".env"; unrelated keys in that file are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=os.environ.get("ENV_FILE", ".env"), extra="ignore"
    )
    turso_url: str
    turso_token: str

    @property
    def turso_host(self) -> str:
        """Return turso_url with a leading "libsql://" scheme removed, if present."""
        return self.turso_url.removeprefix("libsql://")


def pipeline(settings: Settings, statements: list[str], timeout: int = TIMEOUT) -> list[dict]:
    """Send SQL statements to the database's /v2/pipeline endpoint in one request.

    Each statement becomes one "execute" request; a trailing "close" request
    is appended after them.

    Args:
        settings: host and bearer token used for the request.
        statements: SQL strings, sent in the given order.
        timeout: HTTP timeout in seconds.

    Returns:
        The "result" payload of every statement, in order.

    Raises:
        httpx.HTTPStatusError: the HTTP response had an error status.
        RuntimeError: a statement came back with an "error" result.
    """
    requests = [{"type": "execute", "stmt": {"sql": sql}} for sql in statements]
    requests.append({"type": "close"})

    response = httpx.post(
        f"https://{settings.turso_host}/v2/pipeline",
        headers={
            "Authorization": f"Bearer {settings.turso_token}",
            "Content-Type": "application/json",
        },
        json={"requests": requests},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()

    results = []
    # The last entry answers the trailing "close" request and carries no rows.
    for i, result in enumerate(data["results"][:-1]):
        if result["type"] == "error":
            raise RuntimeError(f"statement {i} failed: {result['error']}")
        results.append(result["response"]["result"])
    return results


def scalar(settings: Settings, sql: str) -> int:
    """Run a single-value query and return the first cell of the first row as an int."""
    results = pipeline(settings, [sql], timeout=30)
    cell = results[0]["rows"][0][0]
    # A cell is either a {"value": ...} wrapper or the bare value itself.
    return int(cell["value"] if isinstance(cell, dict) else cell)


def step(msg: str) -> None:
    """Print the start of a progress line, leaving the cursor on the same line."""
    print(f" {msg}...", end="", flush=True)


def done(detail: str = "") -> None:
    """Finish the progress line opened by step(), with an optional detail."""
    suffix = f" ({detail})" if detail else ""
    print(f" ok{suffix}", flush=True)


def main() -> None:
    """Rebuild the documents table, its unique index and its FTS table.

    Exits with status 1 when settings cannot be loaded, or when the copy
    into documents_new does not hold the same number of rows as documents
    (in which case the original table is left in place).
    """
    try:
        settings = Settings()  # type: ignore
    except Exception as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    # This query also proves `documents` exists before anything is dropped.
    total = scalar(settings, "SELECT count(*) FROM documents")
    print(f"documents table: {total} rows")

    t0 = time.time()

    # step 0: drop corrupted DiskANN shadow tables
    step("dropping corrupted vector index tables")
    pipeline(settings, [
        "DROP TABLE IF EXISTS documents_embedding_idx_shadow",
        "DROP TABLE IF EXISTS libsql_vector_meta_shadow",
    ])
    done()

    # step 1: create new table with clean schema
    step("creating documents_new")
    pipeline(settings, [
        # A leftover documents_new from an interrupted run would make the
        # CREATE fail; `documents` is known to exist at this point, so the
        # leftover holds nothing that is not still in the source table.
        "DROP TABLE IF EXISTS documents_new",
        """
        CREATE TABLE documents_new (
            uri TEXT PRIMARY KEY,
            did TEXT NOT NULL,
            rkey TEXT NOT NULL,
            title TEXT NOT NULL,
            content TEXT NOT NULL,
            created_at TEXT,
            publication_uri TEXT,
            platform TEXT DEFAULT 'leaflet',
            source_collection TEXT DEFAULT 'pub.leaflet.document',
            path TEXT,
            base_path TEXT DEFAULT '',
            has_publication INTEGER DEFAULT 0,
            indexed_at TEXT,
            embedding F32_BLOB(1024)
        )
        """,
    ])
    done()

    # step 2: copy all data (skip embeddings — wrong dims, will be re-generated)
    step(f"copying {total} rows (without embeddings)")
    pipeline(settings, ["""
        INSERT INTO documents_new
            (uri, did, rkey, title, content, created_at, publication_uri,
             platform, source_collection, path, base_path, has_publication, indexed_at)
        SELECT uri, did, rkey, title, content, created_at, publication_uri,
               platform, source_collection, path, base_path, has_publication, indexed_at
        FROM documents
    """])
    done()

    # step 3: verify the copy before the irreversible drop
    step("verifying copy")
    source_rows = scalar(settings, "SELECT count(*) FROM documents")
    copied_rows = scalar(settings, "SELECT count(*) FROM documents_new")
    if copied_rows != source_rows:
        print(
            f" failed: documents_new has {copied_rows} rows, documents has {source_rows};"
            " leaving documents untouched",
            file=sys.stderr,
        )
        sys.exit(1)
    done(f"{copied_rows} rows")

    # step 4: drop old table (SQLite removes its indexes and triggers with it), rename new
    # NOTE(review): the two statements are sent as separate execute requests —
    # confirm whether the pipeline applies them atomically.
    step("swapping tables")
    pipeline(settings, [
        "DROP TABLE documents",
        "ALTER TABLE documents_new RENAME TO documents",
    ])
    done()

    # step 5: recreate indexes
    step("creating indexes")
    pipeline(settings, [
        "CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_did_rkey ON documents(did, rkey)",
    ])
    done()

    # step 6: recreate FTS
    step("rebuilding FTS index")
    pipeline(settings, [
        "DROP TABLE IF EXISTS documents_fts",
        "CREATE VIRTUAL TABLE documents_fts USING fts5(uri UNINDEXED, title, content)",
        "INSERT INTO documents_fts (uri, title, content) SELECT uri, title, content FROM documents",
    ])
    done()

    # verify
    new_total = scalar(settings, "SELECT count(*) FROM documents")
    fts_count = scalar(settings, "SELECT count(*) FROM documents_fts")
    embedded = scalar(settings, "SELECT count(*) FROM documents WHERE embedding IS NOT NULL")
    elapsed = time.time() - t0

    print(f"\ndone in {elapsed:.1f}s")
    print(f" documents: {new_total}")
    print(f" FTS indexed: {fts_count}")
    print(f" embeddings: {embedded} (will be re-generated by backfill-embeddings)")


if __name__ == "__main__":
    main()