vibe · anil.recoil.org/odoc-mcp@a5e648a

+477

2 changed files

expand all

CLAUDE.md

odoc2llm.py

CLAUDE.md

··· 1 + I wish to turn JSON files output by odoc-driver (an OCaml documentation generator) into succinct Markdown that is a │ 2 + good input to a coding model such as you. Look at │ 3 + _html/mirage-crypto/mirage-crypto/Mirage_crypto/DES/CTR/index.html.json as one such example, with more being in _html/ │ 4 + but be aware there are thousands of files. Write me an odoc2llm.py Python script that uses Beautiful Soup and JSON │ 5 + parsing to crunch up just the relevant signatures and crosslinks into a _single_ markdown file from the _html │ 6 + directory

+471

odoc2llm.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "bs4",
# ]
# ///
"""
odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs

This script processes JSON files generated by odoc-driver (OCaml documentation generator)
and produces a single Markdown file with the essential module structure and signatures
formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
"""

import os
import sys
import json
import re
from bs4 import BeautifulSoup
from collections import defaultdict
import argparse
from pathlib import Path
import html


# Leading kind label of a page header. "Module type" must be tried before
# "Module", otherwise "Module type S" would be read as a module named "type S".
_HEADER_KIND_RE = re.compile(r"^(Module type|Module|Class)\b\s*")

# Patterns used by extract_signature_name, tried in order; group 1 is the name.
_SIGNATURE_NAME_PATTERNS = (
    # val name : ...   or   val ( + ) : ...
    re.compile(r"val\s+(\([^)]*\)|[a-zA-Z0-9_']+)\s*:"),
    # type t, type 'a t, type ('a, 'b) t, type +'a t, type nonrec t
    re.compile(
        r"type\s+(?:nonrec\s+)?"
        r"(?:(?:\([^)]*\)|[+\-!]*(?:'[a-zA-Z0-9_]+|_))\s+)?"
        r"([a-zA-Z0-9_']+)(?:\s|\[|$)"
    ),
    # module M   or   module type S
    re.compile(r"module\s+(?:type\s+)?([a-zA-Z0-9_']+)"),
    # class c, class type c, class virtual c, class ['a] c
    re.compile(
        r"class\s+(?:type\s+)?(?:virtual\s+)?(?:\[[^\]]*\]\s+)?([a-zA-Z0-9_']+)"
    ),
    # exception E
    re.compile(r"exception\s+([a-zA-Z0-9_']+)"),
)

# Signature keywords that get a fixed heading, in output order. Any other
# keyword is emitted afterwards under a generated "<Keyword>s" heading.
_SECTION_ORDER = (
    ("type", "Types"),
    ("exception", "Exceptions"),
    ("val", "Values"),
    ("module", "Modules"),
    ("class", "Classes"),
)

# File names that never contain module documentation.
_SKIPPED_FILENAMES = frozenset({"sidebar.json", "status.json", "sherlodoc_db.js"})


def extract_module_info(json_content):
    """Extract module information from odoc JSON content.

    Args:
        json_content: Text of one odoc JSON file. The "header", "breadcrumbs",
            "content" and "preamble" keys are read; all are optional.

    Returns:
        A dict with keys "name" (str), "type" ("Module", "Module type" or
        "Class"), "breadcrumbs" (list of str), "content" (parsed
        BeautifulSoup tree of the page body) and "preamble" (raw HTML str).

    Raises:
        json.JSONDecodeError: If json_content is not valid JSON.
    """
    data = json.loads(json_content)

    header_soup = BeautifulSoup(data.get("header", ""), "html.parser")
    header_text = header_soup.get_text().strip()

    # Decide the kind from the header's leading label only, so that a module
    # whose *name* contains "Class" (e.g. "Module Classify") stays a Module.
    kind_match = _HEADER_KIND_RE.match(header_text)
    module_type = kind_match.group(1) if kind_match else "Module"

    # Prefer the <code> element for the name; otherwise strip the kind label.
    code_tag = header_soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        module_name = _HEADER_KIND_RE.sub("", header_text)

    # Breadcrumbs give the package/library/module path for context.
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if not name:
            continue
        clean_name = BeautifulSoup(name, "html.parser").get_text().strip()
        # Drop the backticks around the library name.
        clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
        breadcrumbs.append(clean_name)

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": BeautifulSoup(data.get("content", ""), "html.parser"),
        "preamble": data.get("preamble", ""),
    }


def clean_signature_text(text):
    """Clean up signature text for better readability.

    Removes the invisible joiner character found in odoc output, maps the
    typographic minus, hyphen and arrow characters to plain ASCII, and
    collapses runs of whitespace.
    """
    # The first literal below is an invisible character, not an empty string.
    text = text.replace('⁠', '').replace('−', '-').replace('‑', '-').replace('→', '->')

    # Collapse runs of two or more whitespace characters into one space; the
    # lookarounds stop a run from matching right next to a newline.
    text = re.sub(r'(?<!\n)\s{2,}(?!\n)', ' ', text)

    return text


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.).

    Handles val (including parenthesised operators), type (skipping type
    parameters and "nonrec"), module / module type, class and exception
    signatures.

    Returns:
        The declared name, or None if sig_content matches no known form.
    """
    for pattern in _SIGNATURE_NAME_PATTERNS:
        match = pattern.match(sig_content)
        if match:
            return match.group(1)
    return None


def _is_inside_list_item(tag, root):
    """Return True if tag has an <li> ancestor below root."""
    for ancestor in tag.parents:
        if ancestor is root:
            return False
        if ancestor.name == "li":
            return True
    return False


def _code_block_language(code):
    """Return the language from a "language-xxx" class on code, default "ocaml"."""
    for cls in code.get("class") or []:
        if cls.startswith("language-"):
            return cls.removeprefix("language-")
    return "ocaml"


def _extract_doc(doc_div):
    """Flatten a spec-doc div into text lines; return None if it has no content."""
    doc_parts = []

    # Free-standing paragraphs. Paragraphs inside list items are skipped here
    # because the list pass below already emits the item's full text.
    for p in doc_div.find_all("p"):
        if _is_inside_list_item(p, doc_div):
            continue
        p_text = clean_signature_text(p.get_text()).strip()
        if p_text:
            doc_parts.append(p_text)

    # Top-level lists only: a nested list's text is already part of its
    # parent item's text, so visiting it again would duplicate it.
    for lst in doc_div.find_all(["ul", "ol"]):
        if _is_inside_list_item(lst, doc_div):
            continue
        for li in lst.find_all("li", recursive=False):
            # Special tags such as @raises / @returns carry an at-tag span.
            tag_span = li.find("span", class_="at-tag")
            if tag_span:
                tag_name = tag_span.get_text().strip()
                # Detach the tag so it is not repeated in the item text.
                tag_span.extract()
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"@{tag_name} {li_text}")
            else:
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"- {li_text}")

    # Code examples keep their indentation and line breaks verbatim.
    for pre in doc_div.find_all("pre"):
        code = pre.find("code")
        if code:
            lang = _code_block_language(code)
            doc_parts.append(f"```{lang}\n{code.get_text()}\n```")

    return "\n".join(doc_parts) if doc_parts else None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content.

    Args:
        content_soup: Parsed BeautifulSoup tree of a page's "content" HTML.

    Returns:
        A list of dicts, one per "odoc-spec" div that has both a keyword and
        signature code, with keys "id", "type" (the keyword such as "val" or
        "type"), "name" (may be None), "content" (signature text) and "doc"
        (documentation text or None).
    """
    signatures = []

    for spec in content_soup.find_all("div", class_="odoc-spec"):
        sig_id = None
        sig_type = None
        sig_content = None
        doc_content = None

        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # The element id is kept for cross-referencing.
            sig_id = sig_div.get("id", "")

            # The first keyword span names the kind (type, val, module, ...).
            keyword_span = sig_div.find("span", class_="keyword")
            if keyword_span:
                sig_type = keyword_span.get_text().strip()

            code_tag = sig_div.find("code")
            if code_tag:
                sig_content = clean_signature_text(code_tag.get_text())

        doc_div = spec.find("div", class_="spec-doc")
        if doc_div:
            doc_content = _extract_doc(doc_div)

        # Only keep specs that have both a keyword and signature text.
        if sig_type and sig_content:
            signatures.append({
                "id": sig_id,
                "type": sig_type,
                "name": extract_signature_name(sig_content),
                "content": sig_content,
                "doc": doc_content,
            })

    return signatures


def _append_section(md_lines, heading, sigs):
    """Append one "## heading" section listing sigs to md_lines (in place)."""
    md_lines.append(f"## {heading}")
    for sig in sigs:
        md_lines.append("")
        md_lines.append(f"### `{sig['content']}`")
        if sig["doc"]:
            md_lines.append("")
            md_lines.append(sig["doc"])
    md_lines.append("")


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information.

    Args:
        module_info: Dict as returned by extract_module_info.
        signatures: List as returned by parse_module_signature.

    Returns:
        Markdown text: a title and breadcrumb path, the preamble text if any,
        then one section per signature keyword (types, exceptions, values,
        modules, classes, then any other keywords in first-seen order).
    """
    md_lines = []

    breadcrumb_path = " > ".join(module_info["breadcrumbs"])
    md_lines.append(f"# {module_info['type']} `{module_info['name']}`")
    md_lines.append(f"**Path:** {breadcrumb_path}")
    md_lines.append("")

    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Group signatures by keyword, preserving their order within each group.
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    known_keywords = set()
    for keyword, heading in _SECTION_ORDER:
        known_keywords.add(keyword)
        if keyword in sig_by_type:
            _append_section(md_lines, heading, sig_by_type[keyword])

    for keyword, sigs in sig_by_type.items():
        if keyword not in known_keywords:
            _append_section(md_lines, f"{keyword.capitalize()}s", sigs)

    return "\n".join(md_lines)


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files.

    Args:
        json_files: Paths of JSON files to parse.
        root_dir: Directory the paths are made relative to; the first path
            component below it is used as the package name.

    Returns:
        A defaultdict mapping package name to a list of dicts with keys
        "file", "module_info", "signatures" and "path_parts". Files directly
        in root_dir and known non-documentation files are skipped. Files that
        fail to load or parse are reported on stderr and skipped.
    """
    hierarchy = defaultdict(list)

    for json_file in json_files:
        package_parts = os.path.relpath(json_file, root_dir).split(os.sep)

        # A file directly in root_dir has no package component to group under.
        if len(package_parts) < 2 or package_parts[-1] in _SKIPPED_FILENAMES:
            continue

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = f.read()
            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])
        except Exception as e:
            # Best effort: one malformed file must not abort a run over
            # thousands of files, so report it and move on.
            print(f"Error processing {json_file}: {e}", file=sys.stderr)
            continue

        hierarchy[package_parts[0]].append({
            "file": json_file,
            "module_info": module_info,
            "signatures": signatures,
            "path_parts": package_parts,
        })

    return hierarchy


def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation.

    Modules with fewer breadcrumbs (higher in the hierarchy) come first;
    within one depth they are ordered by their last breadcrumb.
    """
    def sort_key(module):
        breadcrumbs = module["module_info"]["breadcrumbs"]
        return (len(breadcrumbs), breadcrumbs[-1] if breadcrumbs else "")

    return sorted(modules, key=sort_key)


def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library.

    Emits a "# Library: <name>" title followed by each module's markdown,
    with a horizontal rule after every module.
    """
    md_lines = [f"# Library: {lib_name}", ""]

    for module in sort_modules_hierarchically(modules):
        md_lines.append(generate_markdown(module["module_info"], module["signatures"]))
        md_lines.append("\n---\n")

    return "\n".join(md_lines)


def main():
    """Command-line entry point: walk html_dir and write one Markdown file."""
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files. os.walk order depends on the filesystem, so sort
    # the result to make the generated Markdown reproducible.
    json_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(html_dir)
        for file in files
        if file.endswith('.html.json')
    )

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        if args.package:
            # Do not silently ignore a filter the user asked for.
            print(
                f"Warning: package {args.package!r} not found; "
                "generating documentation for all packages",
                file=sys.stderr,
            )

        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            markdown_parts.append(generate_markdown_library(lib_name, modules))
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()

Configure Feed

Configure Feed