Add a version that outputs something suitable for Parquet

+240

1 changed file

expand all

odoc2json.py

+240

odoc2json.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "bs4",
# ]
# ///
"""
odoc2json.py - Convert odoc JSON output to structured JSON records

This script parses the JSON output files from odoc-driver (an OCaml documentation
generator) and converts them into structured JSON records that include package name,
version, and each function signature with associated documentation.

The output is intended for further processing, analysis, and search over OCaml type
signatures, especially for loading into columnar formats like Parquet.
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple

from bs4 import BeautifulSoup


# CSS classes on a ".spec" element that identify the kind of item it documents.
_SPEC_KINDS = ('type', 'value', 'module', 'class', 'exception', 'constructor')

# Kind prefixes used in anchor IDs (e.g. "val-map", "type-t").
_ID_PREFIX_RE = re.compile(r'^(type|val|module|class|exception)-')

# "val name :" in a signature. Accepts primed identifiers (map') and
# parenthesised operators such as (+.) or ( * ).
_VAL_NAME_RE = re.compile(r"val\s+([\w']+|\(\s*[^\s()]+\s*\))\s*:")

_BR_TAG_RE = re.compile(r'<br\s*/?\s*>')
_WHITESPACE_RE = re.compile(r'\s+')


def extract_package_info(path: str) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    The path is resolved to an absolute path and its last two components are
    interpreted as ".../package_name/version". The filesystem root (or drive
    anchor) is never treated as a component.

    Args:
        path: Path to the odoc output directory

    Returns:
        Tuple of (package_name, package_version); missing pieces are "unknown"
    """
    resolved = Path(path).resolve()
    # Drop the anchor ("/" or "C:\\") so that "/pkg" yields ("pkg", "unknown")
    # instead of ("/", "pkg").
    parts = resolved.relative_to(resolved.anchor).parts if resolved.anchor else resolved.parts

    if len(parts) >= 2:
        # Second-to-last component is the package name, last is the version.
        return parts[-2], parts[-1]
    if len(parts) == 1:
        # Only one component: assume it is the package name.
        return parts[0], "unknown"
    return "unknown", "unknown"


def _name_from_anchor_id(item_id: str) -> str:
    """Derive an item name from an anchor ID such as "val-map" or "type-t.Foo"."""
    # Parenthesised operators may themselves contain dots, e.g. "val-(+.)";
    # splitting on "." would leave only ")". Keep the whole operator instead.
    if item_id.endswith(')') and '(' in item_id:
        return item_id[item_id.rindex('('):]
    name = item_id.rsplit('.', 1)[-1]
    # Remove prefixes like 'type-', 'val-', etc.
    return _ID_PREFIX_RE.sub('', name)


def _extract_name(spec_elem: Any, kind: str, signature: str) -> Optional[str]:
    """Find the name of the item described by *spec_elem*, or None if unknown."""
    anchor = spec_elem.find('a', class_="anchor")

    # Candidate IDs, in priority order. The anchor's own id is tried first.
    # NOTE(review): odoc-generated HTML appears to put the id on the spec
    # element itself and only an href="#..." on the anchor, so both are
    # consulted as well — confirm against the odoc version in use.
    candidates = []
    if anchor:
        candidates.append(anchor.get('id'))
    candidates.append(spec_elem.get('id'))
    if anchor:
        href = anchor.get('href') or ''
        if href.startswith('#'):
            candidates.append(href[1:])

    for item_id in candidates:
        if item_id:
            name = _name_from_anchor_id(item_id)
            if name:
                return name

    # For values (functions), fall back to the "val name :" pattern in the
    # signature when no usable ID was found.
    if kind == 'value' and signature:
        match = _VAL_NAME_RE.search(signature)
        if match:
            return match.group(1)

    return None


def _extract_documentation(doc_elem: Any) -> str:
    """Return the text of a ".spec-doc" element with whitespace collapsed."""
    # Convert <br> tags to spaces before stripping markup so that the words
    # on either side of a line break do not get glued together.
    html_content = _BR_TAG_RE.sub(' ', str(doc_elem))
    text = BeautifulSoup(html_content, 'html.parser').get_text()
    return _WHITESPACE_RE.sub(' ', text).strip()


def parse_html_content(content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML content from the odoc JSON to extract signatures and documentation.

    Each returned dictionary always has a 'kind' key and, when available,
    'name', 'signature' and 'documentation' keys (inserted in that order).

    Args:
        content: HTML content from the odoc JSON file

    Returns:
        List of dictionaries containing extracted information
    """
    soup = BeautifulSoup(content, 'html.parser')
    result = []

    # Process each specification block (function, type, module, etc.)
    for spec in soup.find_all(class_="odoc-spec"):
        # The spec element contains the signature.
        spec_elem = spec.find(class_="spec")
        if not spec_elem:
            continue

        # The kind is the first recognised CSS class on the spec element.
        kind = next(
            (cls for cls in spec_elem.get('class', []) if cls in _SPEC_KINDS),
            None,
        )
        if not kind:
            continue

        item: Dict[str, Any] = {'kind': kind}

        # Raw signature text; also used for name extraction if needed.
        code_elem = spec_elem.find('code')
        signature = code_elem.get_text() if code_elem else ""

        name = _extract_name(spec_elem, kind, signature)
        if name:
            item['name'] = name

        if signature:
            # Replace newlines and multiple whitespace with a single space.
            item['signature'] = _WHITESPACE_RE.sub(' ', signature).strip()

        doc_elem = spec.find(class_="spec-doc")
        if doc_elem:
            item['documentation'] = _extract_documentation(doc_elem)

        result.append(item)

    return result


def process_json_file(file_path: str, package_name: str, package_version: str) -> List[Dict[str, Any]]:
    """
    Process a single odoc JSON file and extract the relevant information.

    Files that cannot be decoded as UTF-8 JSON, that do not contain a JSON
    object, or that have no string 'content' field yield an empty list.

    Args:
        file_path: Path to the JSON file
        package_name: Name of the package
        package_version: Version of the package

    Returns:
        List of dictionaries containing extracted information
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Diagnostics go to stderr so they never mix with regular output.
        print(f"Error decoding JSON from {file_path}", file=sys.stderr)
        return []

    if not isinstance(data, dict):
        return []

    content = data.get('content')
    if not isinstance(content, str):
        return []

    # Extract module path from breadcrumbs, skipping malformed entries so the
    # join below cannot fail on a missing name.
    module_path = [
        crumb['name']
        for crumb in data.get('breadcrumbs') or []
        if isinstance(crumb, dict)
        and crumb.get('kind') == 'module'
        and isinstance(crumb.get('name'), str)
    ]

    # Without module breadcrumbs, fall back to the containing directory name.
    module_name = ".".join(module_path) if module_path else os.path.basename(os.path.dirname(file_path))

    items = parse_html_content(content)

    # Add package and module information to each item
    for item in items:
        item['package_name'] = package_name
        item['package_version'] = package_version
        item['module_name'] = module_name

        # - module_name: just the module hierarchy (e.g., "Math.Operations")
        # - full_path: complete path including item name (e.g., "Math.Operations.add")
        if 'name' in item:
            item['full_path'] = f"{module_name}.{item['name']}"
        else:
            item['full_path'] = module_name

    return items


def process_directory(directory: str) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    Only files ending in '.html.json' are processed. Directories and files
    are visited in sorted order so the output is reproducible across runs
    and filesystems.

    Args:
        directory: Path to the directory containing odoc JSON files

    Returns:
        List of all extracted items from all files
    """
    all_items: List[Dict[str, Any]] = []
    package_name, package_version = extract_package_info(directory)

    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if file.endswith('.html.json'):
                file_path = os.path.join(root, file)
                all_items.extend(process_json_file(file_path, package_name, package_version))

    return all_items


def main() -> None:
    """Command-line entry point: parse arguments, convert, and write the JSON output."""
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    args = parser.parse_args()

    # os.walk silently yields nothing for a missing directory, which would
    # produce an empty output file; fail loudly instead.
    if not os.path.isdir(args.input_dir):
        parser.error(f"input_dir is not a directory: {args.input_dir}")

    # Process all files in the directory
    items = process_directory(args.input_dir)

    # Write the output
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(items, f, indent=2 if args.pretty else None, ensure_ascii=False)

    print(f"Processed {len(items)} items and saved to {args.output_file}")


if __name__ == "__main__":
    main()

Configure Feed

Configure Feed