I wish to turn JSON files output by odoc-driver (an OCaml documentation generator) into succinct Markdown that is a
good input to a coding model such as you. Look at
_html/mirage-crypto/mirage-crypto/Mirage_crypto/DES/CTR/index.html.json as one such example, with more being in _html/,
but be aware there are thousands of files. Write me an odoc2llm.py Python script that uses Beautiful Soup and JSON
parsing to crunch up just the relevant signatures and crosslinks into a _single_ Markdown file from the _html
directory.
+471
odoc2llm.py
···11+#!/usr/bin/env python3
22+# /// script
33+# requires-python = ">=3.11"
44+# dependencies = [
55+# "bs4",
66+# ]
77+# ///
88+"""
99+odoc2llm.py - Convert OCaml odoc documentation JSON to concise Markdown for LLMs
1010+1111+This script processes JSON files generated by odoc-driver (OCaml documentation generator)
1212+and produces a single Markdown file with the essential module structure and signatures
1313+formatted in a way that makes it useful for LLMs to reason about OCaml codebases.
1414+"""
1515+1616+import os
1717+import sys
1818+import json
1919+import re
2020+from bs4 import BeautifulSoup
2121+from collections import defaultdict
2222+import argparse
2323+from pathlib import Path
2424+import html
def extract_module_info(json_content):
    """Extract module information from odoc JSON content.

    Args:
        json_content: Text of a single odoc ``*.html.json`` file.

    Returns:
        A dict with the keys:
          * ``name`` - text of the first ``<code>`` element in the header, or
            the header text with its kind prefix removed when there is none.
          * ``type`` - ``"Module"``, ``"Module type"`` or ``"Class"``.
          * ``breadcrumbs`` - list of plain-text breadcrumb names.
          * ``content`` - BeautifulSoup tree built from the ``content`` HTML.
          * ``preamble`` - the raw ``preamble`` value (``""`` when absent).

    Raises:
        json.JSONDecodeError: If ``json_content`` is not valid JSON.
    """
    data = json.loads(json_content)

    # The header is an HTML fragment; its text starts with the kind of item.
    header_soup = BeautifulSoup(data.get("header", ""), "html.parser")
    header_text = header_soup.get_text().strip()

    # Anchor the kind at the start of the header and list the longest
    # alternative first: a substring test would label a module such as
    # ``Class_utils`` as a class, and ``Module`` listed before ``Module type``
    # would win the alternation and leave a stray "type " in the name.
    kind_match = re.match(r'(Module type|Module|Class)\b', header_text)
    module_type = kind_match.group(1) if kind_match else "Module"

    # Prefer the <code> element for the name; otherwise strip the kind prefix.
    code_tag = header_soup.find("code")
    if code_tag:
        module_name = code_tag.get_text().strip()
    else:
        module_name = re.sub(r'^(Module type|Module|Class)\s+', '', header_text)

    # Breadcrumb names are HTML fragments too; keep only their text.
    breadcrumbs = []
    for crumb in data.get("breadcrumbs", []):
        name = crumb.get("name", "")
        if not name:
            continue
        clean_name = BeautifulSoup(name, "html.parser").get_text().strip()
        # Drop the backticks odoc puts around library names.
        clean_name = re.sub(r'Library\s+`([^`]+)`', r'Library \1', clean_name)
        breadcrumbs.append(clean_name)

    content_soup = BeautifulSoup(data.get("content", ""), "html.parser")

    return {
        "name": module_name,
        "type": module_type,
        "breadcrumbs": breadcrumbs,
        "content": content_soup,
        "preamble": data.get("preamble", ""),
    }
# Characters are written as escapes so that invisible ones cannot be lost when
# the file is copied or re-encoded.
# NOTE(review): the original first replacement had an empty search string (a
# no-op); it presumably targeted one of the zero-width characters below -
# confirm which one odoc actually emits.
_SIGNATURE_CHAR_MAP = str.maketrans({
    '\u200b': None,   # zero width space
    '\u2060': None,   # word joiner
    '\ufeff': None,   # zero width no-break space / BOM
    '\u00ad': None,   # soft hyphen
    '\u2212': '-',    # minus sign
    '\u2011': '-',    # non-breaking hyphen
    '\u2192': '->',   # rightwards arrow
})

# Runs of two or more whitespace characters that are neither directly preceded
# nor directly followed by a newline.
_MULTI_SPACE_RE = re.compile(r'(?<!\n)\s{2,}(?!\n)')


def clean_signature_text(text):
    """Clean up signature text for better readability.

    Replaces typographic arrows and hyphens with their ASCII equivalents,
    removes zero-width characters, and collapses whitespace runs to a single
    space (a run touching a newline on either side is handled by the
    look-arounds of ``_MULTI_SPACE_RE``).

    Args:
        text: Text extracted from an HTML fragment.

    Returns:
        The cleaned string; leading and trailing whitespace is not stripped.
    """
    text = text.translate(_SIGNATURE_CHAR_MAP)
    return _MULTI_SPACE_RE.sub(' ', text)
# One pattern per kind of signature item; group 1 is always the declared name.
# Modifiers and type parameters that may sit between the keyword and the name
# are skipped explicitly so they are never mistaken for the name.
_SIGNATURE_NAME_PATTERNS = (
    # val f : ...   /   val (+) : ...   /   val ( * ) : ...
    re.compile(r"\s*val\s+(\([^)]*\)|[A-Za-z_][\w']*)\s*:"),
    # type t / type nonrec t / type 'a t / type +'a t / type ('a, 'b) t / type _ t
    re.compile(
        r"\s*type\s+(?:nonrec\s+)?"
        r"(?:\([^)]*\)\s*|[+\-!]*(?:'[A-Za-z_]\w*|_)\s+)?"
        r"([A-Za-z_][\w'.]*)"
    ),
    # module M / module type S / module rec M
    re.compile(r"\s*module\s+(?:type\s+)?(?:rec\s+)?([A-Za-z_][\w']*)"),
    # class c / class type c / class virtual c / class ['a] c
    re.compile(
        r"\s*class\s+(?:type\s+)?(?:virtual\s+)?(?:\[[^\]]*\]\s*)?"
        r"([A-Za-z_][\w']*)"
    ),
    # exception E
    re.compile(r"\s*exception\s+([A-Za-z_][\w']*)"),
)


def extract_signature_name(sig_content):
    """Extract the name of a signature (function name, type name, etc.).

    Handles ``val``, ``type``, ``module``, ``class`` and ``exception`` items.
    Type parameters (``'a``, ``('a, 'b)``, ``_``, with optional variance
    marks) and the modifiers ``nonrec``, ``type``, ``rec`` and ``virtual``
    are skipped. Operator values are returned as written, parentheses
    included.

    Args:
        sig_content: Signature text, e.g. ``"val f : int -> int"``.

    Returns:
        The declared name, or ``None`` when the text is not one of the
        supported items.
    """
    for pattern in _SIGNATURE_NAME_PATTERNS:
        match = pattern.match(sig_content)
        if match:
            return match.group(1)
    return None
def _code_block_language(code_tag, default="ocaml"):
    """Return the language named by a ``language-*`` class on ``code_tag``."""
    classes = code_tag.get("class") or []
    if isinstance(classes, str):
        classes = classes.split()
    # Look for a class that *starts with* "language-"; testing whether the
    # literal string "language-" is a member of the class list never matches.
    for cls in classes:
        if cls.startswith("language-"):
            return cls.removeprefix("language-") or default
    return default


def _extract_spec_doc(doc_div):
    """Render one ``spec-doc`` div as text, or return ``None`` if it is empty.

    Output order is: all paragraphs, then all list items, then all code
    blocks - not document order.
    NOTE(review): a ``<p>`` nested in an ``<li>`` (or an ``<li>`` in a nested
    list) is visited by more than one of these passes and so appears more
    than once; confirm whether that matters for odoc output.
    """
    doc_parts = []

    # Paragraphs. get_text() already flattens nested <code>/<a> markup.
    for p in doc_div.find_all("p"):
        p_text = clean_signature_text(p.get_text()).strip()
        if p_text:
            doc_parts.append(p_text)

    # List items: at-tag entries become "@tag text", others become "- text".
    for ul in doc_div.find_all("ul"):
        for li in ul.find_all("li"):
            tag_span = li.find("span", class_="at-tag")
            if tag_span:
                tag_name = tag_span.get_text().strip()
                # Remove the tag span so its text is not repeated below.
                tag_span.extract()
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"@{tag_name} {li_text}")
            else:
                li_text = clean_signature_text(li.get_text()).strip()
                doc_parts.append(f"- {li_text}")

    # Code examples, emitted verbatim so indentation and newlines survive.
    for pre in doc_div.find_all("pre"):
        code = pre.find("code")
        if code:
            lang = _code_block_language(code)
            doc_parts.append(f"```{lang}\n{code.get_text()}\n```")

    return "\n".join(doc_parts) if doc_parts else None


def parse_module_signature(content_soup):
    """Parse the OCaml module signature from the HTML content.

    Args:
        content_soup: BeautifulSoup tree of a page's ``content`` HTML.

    Returns:
        A list of dicts, one per ``div.odoc-spec`` that has both a keyword
        span and a ``<code>`` element, with the keys ``id`` (anchor id of the
        spec div), ``type`` (text of the first keyword span, e.g. ``val``),
        ``name`` (from ``extract_signature_name``; may be ``None``),
        ``content`` (cleaned text of the first ``<code>`` element) and
        ``doc`` (rendered documentation text or ``None``).
    """
    signatures = []

    for spec in content_soup.find_all("div", class_="odoc-spec"):
        sig_id = None
        sig_type = None
        sig_content = None

        sig_div = spec.find("div", class_="spec")
        if sig_div:
            # The id is kept for cross-referencing.
            sig_id = sig_div.get("id", "")

            keyword_span = sig_div.find("span", class_="keyword")
            if keyword_span:
                sig_type = keyword_span.get_text().strip()

            code_tag = sig_div.find("code")
            if code_tag:
                # get_text() concatenates the text of all nested spans.
                sig_content = clean_signature_text(code_tag.get_text())

        doc_div = spec.find("div", class_="spec-doc")
        doc_content = _extract_spec_doc(doc_div) if doc_div else None

        # Only keep specs that carry an actual signature.
        if sig_type and sig_content:
            signatures.append({
                "id": sig_id,
                "type": sig_type,
                "name": extract_signature_name(sig_content),
                "content": sig_content,
                "doc": doc_content,
            })

    return signatures
# Signature kinds that get a fixed heading, in output order.
_SIGNATURE_SECTIONS = (
    ("type", "Types"),
    ("exception", "Exceptions"),
    ("val", "Values"),
    ("module", "Modules"),
    ("class", "Classes"),
)


def _append_signature_section(md_lines, title, sigs):
    """Append a ``## title`` section listing ``sigs`` to ``md_lines``."""
    md_lines.append(f"## {title}")
    for sig in sigs:
        md_lines.append("")
        md_lines.append(f"### `{sig['content']}`")
        # Documentation, when present, follows the heading after a blank line.
        if sig["doc"]:
            md_lines.append("")
            md_lines.append(sig["doc"])
    md_lines.append("")


def generate_markdown(module_info, signatures):
    """Generate markdown documentation from parsed module information.

    Args:
        module_info: Dict with ``type``, ``name``, ``breadcrumbs`` and
            ``preamble`` (an HTML string, possibly empty).
        signatures: List of dicts with at least ``type``, ``content`` and
            ``doc``.

    Returns:
        Markdown text: a title line, a ``**Path:**`` line, the preamble text
        if any, then one section per signature kind - Types, Exceptions,
        Values, Modules, Classes, followed by any other kinds in
        first-seen order under the heading ``<Kind>s``.
    """
    md_lines = [
        f"# {module_info['type']} `{module_info['name']}`",
        f"**Path:** {' > '.join(module_info['breadcrumbs'])}",
        "",
    ]

    # Module-level documentation, reduced to plain text.
    if module_info["preamble"]:
        preamble_soup = BeautifulSoup(module_info["preamble"], "html.parser")
        preamble_text = clean_signature_text(preamble_soup.get_text()).strip()
        if preamble_text:
            md_lines.append(preamble_text)
            md_lines.append("")

    # Group signatures by keyword, preserving their order within each group.
    sig_by_type = defaultdict(list)
    for sig in signatures:
        sig_by_type[sig["type"]].append(sig)

    known_kinds = set()
    for kind, title in _SIGNATURE_SECTIONS:
        known_kinds.add(kind)
        if kind in sig_by_type:
            _append_signature_section(md_lines, title, sig_by_type[kind])

    # Any other keyword gets a heading derived from the keyword itself.
    for kind, sigs in sig_by_type.items():
        if kind not in known_kinds:
            _append_signature_section(md_lines, f"{kind.capitalize()}s", sigs)

    return "\n".join(md_lines)
# File names that are never module documentation.
_NON_MODULE_FILES = frozenset({"sidebar.json", "status.json", "sherlodoc_db.js"})


def build_module_hierarchy(json_files, root_dir):
    """Build a hierarchical structure from all the JSON files.

    Each file is parsed with ``extract_module_info`` and
    ``parse_module_signature`` and grouped under the first component of its
    path relative to ``root_dir``. Files directly inside ``root_dir`` belong
    to no package and are skipped, as are the names in ``_NON_MODULE_FILES``.

    Args:
        json_files: Iterable of paths to odoc JSON files.
        root_dir: Directory the package names are taken relative to.

    Returns:
        A ``defaultdict(list)`` mapping package name to a list of dicts with
        the keys ``file``, ``module_info``, ``signatures`` and ``path_parts``.
        A file that cannot be read or parsed is reported on stderr and
        omitted.
    """
    hierarchy = defaultdict(list)

    for json_file in json_files:
        package_parts = os.path.relpath(json_file, root_dir).split(os.sep)

        if package_parts[-1] in _NON_MODULE_FILES:
            continue
        # A file with no parent directory has no package to be grouped under,
        # so parsing it would be wasted work.
        if len(package_parts) < 2:
            continue

        # Best effort: one unreadable or malformed file must not abort a run
        # over thousands of files, so report it and move on.
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = f.read()
            module_info = extract_module_info(json_content)
            signatures = parse_module_signature(module_info["content"])
        except Exception as e:
            print(f"Error processing {json_file}: {e}", file=sys.stderr)
            continue

        hierarchy[package_parts[0]].append({
            "file": json_file,
            "module_info": module_info,
            "signatures": signatures,
            "path_parts": package_parts,
        })

    return hierarchy
def sort_modules_hierarchically(modules):
    """Sort modules to ensure proper hierarchical presentation.

    Modules are ordered by breadcrumb depth (shallower first), then by the
    last breadcrumb, then by the full breadcrumb path. The final key only
    breaks ties between same-named modules at the same depth (for example two
    ``CTR`` submodules of different parents), so the result does not depend
    on the order in which the files were discovered.

    Args:
        modules: Iterable of dicts whose ``module_info["breadcrumbs"]`` is a
            list of strings.

    Returns:
        A new sorted list; the input is not modified.
    """
    def sort_key(module):
        crumbs = module["module_info"]["breadcrumbs"]
        return (len(crumbs), crumbs[-1] if crumbs else "", tuple(crumbs))

    return sorted(modules, key=sort_key)
def generate_markdown_library(lib_name, modules):
    """Generate markdown for a specific library.

    Emits a ``# Library: <lib_name>`` heading followed by the markdown of
    every module, in the order given by ``sort_modules_hierarchically``,
    each one followed by a ``---`` separator.

    Args:
        lib_name: Name shown in the library heading.
        modules: Module dicts with ``module_info`` and ``signatures`` keys.

    Returns:
        The library's markdown as a single string.
    """
    sections = [f"# Library: {lib_name}", ""]

    for entry in sort_modules_hierarchically(modules):
        rendered = generate_markdown(entry["module_info"], entry["signatures"])
        sections += [rendered, "\n---\n"]

    return "\n".join(sections)
def main():
    """Command-line entry point: convert an odoc output tree to one Markdown file.

    Walks ``html_dir`` for ``*.html.json`` files, parses them, and writes the
    combined Markdown to ``--output``. With ``--package`` only that package is
    written; if the package is not found a warning is printed and every
    package is written instead. Exits with status 1 when ``html_dir`` is not
    a directory.
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
    parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
    parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')
    parser.add_argument('--package', '-p', help='Focus on a specific package/library')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    args = parser.parse_args()

    html_dir = Path(args.html_dir)

    if not html_dir.exists() or not html_dir.is_dir():
        print(f"Error: {html_dir} is not a valid directory", file=sys.stderr)
        sys.exit(1)

    # Find all JSON files. os.walk order depends on the filesystem, so sort
    # the result to make the generated document reproducible.
    json_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(html_dir)
        for file in files
        if file.endswith('.html.json')
    )

    if args.verbose:
        print(f"Found {len(json_files)} JSON files", file=sys.stderr)

    # Build module hierarchy
    hierarchy = build_module_hierarchy(json_files, html_dir)

    if args.verbose:
        print(f"Processed {len(hierarchy)} libraries", file=sys.stderr)
        for lib, modules in hierarchy.items():
            print(f"  - {lib}: {len(modules)} modules", file=sys.stderr)

    # Generate markdown for all or specific package
    if args.package and args.package in hierarchy:
        markdown = generate_markdown_library(args.package, hierarchy[args.package])
    else:
        if args.package:
            # Do not fall back silently: the user asked for one package and
            # is about to receive all of them.
            print(
                f"Warning: package {args.package!r} not found in {html_dir}; "
                "writing all packages instead",
                file=sys.stderr,
            )

        # Combine all packages
        markdown_parts = []
        for lib_name, modules in sorted(hierarchy.items()):
            if args.verbose:
                print(f"Generating markdown for {lib_name} ({len(modules)} modules)...", file=sys.stderr)
            lib_md = generate_markdown_library(lib_name, modules)
            markdown_parts.append(lib_md)
            markdown_parts.append("\n\n")

        markdown = "\n".join(markdown_parts)

    # Write markdown to output file
    with open(args.output, 'w', encoding='utf-8') as f:
        f.write(markdown)

    print(f"Generated Markdown documentation in {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()