fix: extract leaflet block content in backfill-pds

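the script only checked the textContent/content/text fields, so it missed
leaflet's pages[].blocks[].block.plaintext structure and indexed leaflet
documents with empty content. it now reads plaintext from text, header,
blockquote and code blocks, and from (nested) unorderedList items via
children[].content.plaintext.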

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

zzstoatzz 5 months ago c865eef6 cf495f4b

+58 -2

1 changed file

expand all

scripts

backfill-pds

+58 -2

scripts/backfill-pds

··· 104 104 response.raise_for_status() 105 105 106 106 107 + def extract_leaflet_blocks(pages: list) -> str: 108 + """Extract text from leaflet pages/blocks structure.""" 109 + texts = [] 110 + for page in pages: 111 + if not isinstance(page, dict): 112 + continue 113 + blocks = page.get("blocks", []) 114 + for wrapper in blocks: 115 + if not isinstance(wrapper, dict): 116 + continue 117 + block = wrapper.get("block", {}) 118 + if not isinstance(block, dict): 119 + continue 120 + # Extract plaintext from text, header, blockquote, code blocks 121 + block_type = block.get("$type", "") 122 + if block_type in ( 123 + "pub.leaflet.blocks.text", 124 + "pub.leaflet.blocks.header", 125 + "pub.leaflet.blocks.blockquote", 126 + "pub.leaflet.blocks.code", 127 + ): 128 + plaintext = block.get("plaintext", "") 129 + if plaintext: 130 + texts.append(plaintext) 131 + # Handle lists 132 + elif block_type == "pub.leaflet.blocks.unorderedList": 133 + texts.extend(extract_list_items(block.get("children", []))) 134 + return " ".join(texts) 135 + 136 + 137 + def extract_list_items(children: list) -> list[str]: 138 + """Recursively extract text from list items.""" 139 + texts = [] 140 + for child in children: 141 + if not isinstance(child, dict): 142 + continue 143 + content = child.get("content", {}) 144 + if isinstance(content, dict): 145 + plaintext = content.get("plaintext", "") 146 + if plaintext: 147 + texts.append(plaintext) 148 + # Recurse into nested children 149 + nested = child.get("children", []) 150 + if nested: 151 + texts.extend(extract_list_items(nested)) 152 + return texts 153 + 154 + 107 155 def extract_document(record: dict, collection: str) -> dict | None: 108 156 """Extract document fields from a record.""" 109 157 value = record.get("value", {}) ··· 113 161 if not title: 114 162 return None 115 163 116 - # Get content - try textContent (site.standard), then content/text 117 - content = value.get("textContent") or value.get("content") or value.get("text") or "" 164 
+ # Get content - try textContent (site.standard), then leaflet blocks, then content/text 165 + content = value.get("textContent") or "" 166 + if not content: 167 + # Try leaflet-style pages/blocks 168 + pages = value.get("pages", []) 169 + if pages: 170 + content = extract_leaflet_blocks(pages) 171 + if not content: 172 + # Fall back to simple content/text fields 173 + content = value.get("content") or value.get("text") or "" 118 174 if isinstance(content, dict): 119 175 # Handle richtext format 120 176 content = content.get("text", "")

Configure Feed

Configure Feed