···11+#!/usr/bin/env python3
22+# /// script
33+# requires-python = ">=3.11"
44+# dependencies = [
55+# "bs4",
66+# ]
77+# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records

This script parses the JSON output files from odoc-driver (an OCaml documentation
generator) and converts them into structured JSON records that include package name,
version, and each function signature with associated documentation.

The output is intended for further processing, analysis, and search over OCaml type
signatures, especially for loading into columnar formats like Parquet.
"""
1818+1919+import os
2020+import json
2121+import re
2222+from bs4 import BeautifulSoup
2323+from typing import Dict, List, Any, Optional, Tuple
2424+import argparse
2525+from pathlib import Path
def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    The path is resolved to an absolute path and its last two components are
    interpreted as ``<package_name>/<package_version>``. The filesystem anchor
    (``/`` on POSIX, drive plus root on Windows) is never counted as a
    component, so a directory sitting directly under the root is reported as
    a package with an unknown version rather than as a package named ``/``.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version). An element is "unknown"
        when the path has too few components to supply it.
    """
    resolved = Path(path).resolve()
    # For an absolute path, Path.parts starts with the anchor; drop it so that
    # only real directory names are considered.
    components = [part for part in resolved.parts if part != resolved.anchor]

    # Expected layout: ".../package_name/version"
    if len(components) >= 2:
        return components[-2], components[-1]
    # A single component is assumed to be the package name.
    if components:
        return components[0], "unknown"
    return "unknown", "unknown"
def _odoc_name_from_id(item_id: str) -> str:
    """Derive an item's short name from an odoc anchor ID such as ``val-map`` or ``type-t.Foo``."""
    if item_id.endswith(')'):
        # Operator IDs look like "val-(+.)": the dot belongs to the operator
        # symbol and is not a path separator, so the ID must not be split.
        name = item_id
    else:
        # Nested IDs ("type-t.Foo") keep only the innermost component.
        name = item_id.rsplit('.', 1)[-1]
    # Remove prefixes like 'type-', 'val-', etc.
    return re.sub(r'^(type|val|module|class|exception)-', '', name)


def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Every element with class ``odoc-spec`` that contains a ``spec`` element of
    a recognised kind produces one record. A record always has ``kind`` and
    may additionally have ``name``, ``signature`` and ``documentation`` when
    those could be extracted.

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information
    """
    known_kinds = ('type', 'value', 'module', 'class', 'exception', 'constructor')
    soup = BeautifulSoup(content, 'html.parser')
    result = []

    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        # The spec element contains the signature
        spec_elem = spec.find(class_="spec")
        if spec_elem is None:
            continue

        # The kind is the first CSS class that names a recognised item kind
        kind = next(
            (cls for cls in spec_elem.get('class', []) if cls in known_kinds),
            None,
        )
        if not kind:
            continue

        item = {'kind': kind}

        # Raw signature text; whitespace is normalised further below, after
        # it has been used for name extraction.
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem is not None else ""

        # Prefer the ID on the anchor link. Fall back to the ID carried by the
        # spec element itself, which is where odoc 2.x output appears to put
        # it (NOTE(review): confirm against the odoc version in use).
        anchor = spec_elem.find('a', class_="anchor")
        item_id = (anchor.get('id') if anchor is not None else None) or spec_elem.get('id')
        name = _odoc_name_from_id(item_id) if item_id else None

        # For values (functions), extract the name from the signature as a
        # last resort. OCaml identifiers may contain "'", which \w alone
        # does not match.
        if kind == 'value' and not name and signature:
            val_match = re.search(r"val\s+([\w']+)\s*:", signature)
            if val_match:
                name = val_match.group(1)

        if name:
            item['name'] = name

        if signature:
            # Replace newlines and multiple whitespace with a single space
            item['signature'] = re.sub(r'\s+', ' ', signature).strip()

        # Extract documentation
        doc_elem = spec.find(class_="spec-doc")
        if doc_elem is not None:
            # Turn <br> tags into spaces before stripping markup so the text
            # on either side of a line break does not get glued together.
            html_content = re.sub(r'<br\s*/?\s*>', ' ', str(doc_elem))
            doc = BeautifulSoup(html_content, 'html.parser').get_text()
            # Replace all newlines and multiple spaces with a single space
            item['documentation'] = re.sub(r'\s+', ' ', doc).strip()

        result.append(item)

    return result
def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Files that cannot be decoded (invalid JSON or invalid UTF-8), that do not
    hold a JSON object, or that have no 'content' key are skipped and yield
    an empty list, so one bad file does not abort a whole directory run.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information. Each record
        carries 'package_name', 'package_version', 'module_name' and
        'full_path' in addition to the fields produced by
        parse_html_content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error decoding JSON from {file_path}")
            return []

    # Valid JSON is not necessarily an object; scalars and arrays carry no
    # 'content' entry to process.
    if not isinstance(data, dict) or 'content' not in data:
        return []

    # Extract module path from breadcrumbs, ignoring entries without a usable
    # name so that str.join never receives None.
    module_path = [
        crumb['name']
        for crumb in data.get('breadcrumbs') or []
        if isinstance(crumb, dict) and crumb.get('kind') == 'module' and crumb.get('name')
    ]

    # Without module breadcrumbs, fall back to the name of the directory
    # containing the file.
    if module_path:
        module_name = ".".join(module_path)
    else:
        module_name = os.path.basename(os.path.dirname(file_path))

    # Extract items from the content
    items = parse_html_content(data['content'])

    # Add package and module information to each item
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items
def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    Only files whose name ends with '.html.json' are processed. Directories
    and files are visited in sorted order so that the order of the returned
    records is reproducible across runs and filesystems.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    all_items = []
    package_name, package_version = extract_package_info(directory)

    for root, dirs, files in os.walk(directory):
        # Sorting in place controls the order in which os.walk descends into
        # subdirectories (top-down traversal honours in-place changes).
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.html.json'):
                file_path = os.path.join(root, file)
                all_items.extend(process_json_file(file_path, package_name, package_version))

    return all_items
def main():
    """
    Command-line entry point: convert an odoc JSON tree into one JSON file.

    Reads the positional arguments ``input_dir`` and ``output_file`` and the
    optional ``--pretty`` flag, processes every odoc JSON file under
    ``input_dir`` and writes the resulting list of records to
    ``output_file``.

    Exits with an argparse usage error (status 2) when ``input_dir`` is not an
    existing directory, instead of silently writing an empty result.
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = parser.parse_args()

    # os.walk yields nothing for a missing path, which would otherwise look
    # like a successful run with zero items.
    if not os.path.isdir(args.input_dir):
        parser.error(f"input_dir is not a directory: {args.input_dir}")

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # Write the output; indent=None is json.dump's compact default
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2 if args.pretty else None, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":
    main()