fix paths · anil.recoil.org/odoc-mcp@dd481f9

+303 -46

2 changed files

expand all

odoc2json.py

odoc2llm.py

+176 -19

odoc2json.py

··· 19 19 import os 20 20 import json 21 21 import re 22 + import time 22 23 from bs4 import BeautifulSoup 23 24 from typing import Dict, List, Any, Optional, Tuple 24 25 import argparse 25 26 from pathlib import Path 26 27 27 28 28 - def extract_package_info(path: str) -> Tuple[str, str]: 29 + def extract_package_info(path: str, mode: str = 'full', 30 + override_package_name: Optional[str] = None, 31 + override_package_version: Optional[str] = None) -> Tuple[str, str]: 29 32 """ 30 33 Extract package name and version from the path. 31 34 32 35 Args: 33 36 path: Path to the odoc output directory 37 + mode: Operating mode - 'full' for full packages list, 'single' for a single package 38 + override_package_name: Optional override for package name 39 + override_package_version: Optional override for package version 34 40 35 41 Returns: 36 42 Tuple of (package_name, package_version) 37 43 """ 44 + # Always prioritize explicit overrides if provided 45 + if override_package_name: 46 + package_name = override_package_name 47 + else: 48 + package_name = "unknown" 49 + 50 + if override_package_version: 51 + package_version = override_package_version 52 + else: 53 + package_version = "unknown" 54 + 55 + # If we have both overrides, no need to analyze path 56 + if override_package_name and override_package_version: 57 + return package_name, package_version 58 + 38 59 # Use Path for more reliable path parsing 39 60 p = Path(path).resolve() 40 61 parts = list(p.parts) 41 62 42 - # If the path is in the format ".../package_name/version/..." 43 - if len(parts) >= 2: 44 - # The package name is typically the second-to-last component 45 - # The version is typically the last component 46 - return parts[-2], parts[-1] 47 - elif len(parts) == 1: 48 - # If only one component, assume it's the package name 49 - return parts[0], "unknown" 50 - else: 51 - return "unknown", "unknown" 63 + if mode == 'single': 64 + # In single package mode, the package name is typically the directory name 65 + if not override_package_name and parts: 66 + # Extract package name from the last part of the path 67 + package_name = parts[-1] 68 + 69 + # Check if there's a subdirectory in the path that seems like a package name 70 + subdir = next((d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))), None) 71 + if subdir: 72 + package_name = subdir 73 + 74 + elif mode == 'full': 75 + # In full mode, we need to look at the directory structure more carefully 76 + # For test/ directory, the structure is test/package-name/package-version/ 77 + 78 + # First, check if the directory structure matches the expected pattern 79 + # Look for subdirectories in the current path 80 + try: 81 + subdirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] 82 + 83 + # If we have subdirectories that might be package names 84 + if subdirs and not override_package_name: 85 + # For each subdirectory (potential package name), check if it contains version subdirectories 86 + for subdir in subdirs: 87 + version_dirs = [d for d in os.listdir(os.path.join(path, subdir)) 88 + if os.path.isdir(os.path.join(path, subdir, d))] 89 + 90 + # If this subdirectory contains potential version directories, it's likely a package 91 + if version_dirs: 92 + # We'll use the current file's path to determine which package and version it belongs to 93 + # We're processing files at the specific file level elsewhere, so here we just return 94 + # default values which will be overridden during actual file processing 95 + return subdir, "unknown" 96 + 97 + # If we found no package structure or we're processing a file already in a package context 98 + # In this case, we'll determine package/version from the path of the file being processed 99 + if len(parts) >= 3: 100 + # Path structure might be test/package-name/version/... 101 + # Check if the first part is "test" 102 + if parts[-3] == "test" or "test" in str(p): 103 + package_name = parts[-2] if not override_package_name else package_name 104 + package_version = parts[-1] if not override_package_version else package_version 105 + else: 106 + # Standard structure: .../package-name/package-version/... 107 + package_name = parts[-2] if not override_package_name else package_name 108 + package_version = parts[-1] if not override_package_version else package_version 109 + except (FileNotFoundError, PermissionError) as e: 110 + # Handle cases where we can't access the directory 111 + print(f"Error accessing directory {path}: {str(e)}") 112 + 113 + return package_name, package_version 52 114 53 115 54 116 def parse_html_content(content: str) -> List[Dict[str, Any]]: ··· 154 216 Returns: 155 217 List of dictionaries containing extracted information 156 218 """ 157 - with open(file_path, 'r', encoding='utf-8') as f: 219 + # Extract package and version from file path if not already properly set 220 + if package_version == "unknown" or package_name == "unknown": 221 + # Check if this file is in a test directory structure 222 + file_path_parts = Path(file_path).resolve().parts 223 + 224 + # Look for test/package-name/version pattern in the path 225 + for i, part in enumerate(file_path_parts): 226 + if part == "test" and i + 2 < len(file_path_parts): 227 + # We found a test directory, extract package name and version 228 + package_name = file_path_parts[i + 1] 229 + package_version = file_path_parts[i + 2] 230 + break 231 + 232 + try: 233 + with open(file_path, 'r', encoding='utf-8') as f: 234 + try: 235 + data = json.load(f) 236 + except json.JSONDecodeError: 237 + print(f"Error decoding JSON from {file_path}") 238 + return [] 239 + except UnicodeDecodeError: 240 + # Try opening with a different encoding or with errors='ignore' 158 241 try: 159 - data = json.load(f) 160 - except json.JSONDecodeError: 161 - print(f"Error decoding JSON from {file_path}") 242 + with open(file_path, 'r', encoding='latin-1') as f: 243 + try: 244 + data = json.load(f) 245 + except json.JSONDecodeError: 246 + print(f"Error decoding JSON from {file_path} with latin-1 encoding") 247 + return [] 248 + except Exception as e: 249 + print(f"Error reading {file_path}: {str(e)}") 162 250 return [] 163 251 164 252 if 'content' not in data: ··· 193 281 return items 194 282 195 283 196 - def process_directory(directory: str) -> List[Dict[str, Any]]: 284 + def process_directory(directory: str, mode: str = 'full', 285 + override_package_name: Optional[str] = None, 286 + override_package_version: Optional[str] = None) -> List[Dict[str, Any]]: 197 287 """ 198 288 Process all JSON files in a directory recursively. 199 289 200 290 Args: 201 291 directory: Path to the directory containing odoc JSON files 292 + mode: Operating mode - 'full' for full packages list, 'single' for a single package 293 + override_package_name: Optional override for package name 294 + override_package_version: Optional override for package version 202 295 203 296 Returns: 204 297 List of all extracted items from all files 205 298 """ 206 299 all_items = [] 207 - package_name, package_version = extract_package_info(directory) 300 + package_name, package_version = extract_package_info( 301 + directory, 302 + mode=mode, 303 + override_package_name=override_package_name, 304 + override_package_version=override_package_version 305 + ) 306 + 307 + # First count total files to process for progress tracking 308 + total_files = 0 309 + for root, _, files in os.walk(directory): 310 + for file in files: 311 + if file.endswith('.html.json'): 312 + total_files += 1 313 + 314 + if total_files == 0: 315 + print(f"No .html.json files found in {directory}") 316 + return all_items 317 + 318 + mode_str = f"single package mode" if mode == 'single' else "full packages mode" 319 + print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...") 320 + 321 + # Process each file with progress indicator 322 + processed_files = 0 323 + extracted_items = 0 208 324 209 325 for root, _, files in os.walk(directory): 210 326 for file in files: ··· 212 328 file_path = os.path.join(root, file) 213 329 items = process_json_file(file_path, package_name, package_version) 214 330 all_items.extend(items) 331 + 332 + # Update progress 333 + processed_files += 1 334 + extracted_items += len(items) 335 + 336 + # Print progress every 100 files or on the last file 337 + if processed_files % 100 == 0 or processed_files == total_files: 338 + percent = (processed_files / total_files) * 100 339 + print(f"Progress: {processed_files}/{total_files} files ({percent:.1f}%) - {extracted_items} items extracted", 340 + end="\r", flush=True) 215 341 342 + print(f"\nCompleted processing {processed_files} files - extracted {extracted_items} items total.") 216 343 return all_items 217 344 218 345 219 346 def main(): 347 + """ 348 + Main entry point for the script. 349 + 350 + Usage examples: 351 + 352 + # Process in full mode (multiple packages) 353 + python odoc2json.py /path/to/odoc/output output.json 354 + 355 + # Process a single package with automatic detection 356 + python odoc2json.py /path/to/odoc/package output.json --mode single 357 + 358 + # Process with explicit package name and version 359 + python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0 360 + """ 220 361 parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records') 221 362 parser.add_argument('input_dir', help='Directory containing odoc JSON output') 222 363 parser.add_argument('output_file', help='Output JSON file path') 223 364 parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output') 365 + parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output') 366 + parser.add_argument('--mode', choices=['full', 'single'], default='full', 367 + help='Run mode: "full" for complete list of packages, "single" for a single package') 368 + parser.add_argument('--package-name', help='Override the package name (useful in single mode)') 369 + parser.add_argument('--package-version', help='Override the package version (useful in single mode)') 224 370 args = parser.parse_args() 371 + 372 + start_time = time.time() 373 + print(f"Starting extraction from {args.input_dir} in {args.mode} mode") 225 374 226 375 # Process all files in the directory 227 - items = process_directory(args.input_dir) 376 + items = process_directory( 377 + args.input_dir, 378 + mode=args.mode, 379 + override_package_name=args.package_name, 380 + override_package_version=args.package_version 381 + ) 228 382 229 383 # Write the output 384 + print(f"Writing {len(items)} items to {args.output_file}...") 230 385 with open(args.output_file, 'w', encoding='utf-8') as f: 231 386 if args.pretty: 232 387 json.dump(items, f, indent=2, ensure_ascii=False) 233 388 else: 234 389 json.dump(items, f, ensure_ascii=False) 235 390 236 - print(f"Processed {len(items)} items and saved to {args.output_file}") 391 + elapsed_time = time.time() - start_time 392 + print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds") 393 + print(f"Output saved to {args.output_file}") 237 394 238 395 239 396 if __name__ == "__main__":

+127 -27

odoc2llm.py

··· 26 26 27 27 def extract_module_info(json_content): 28 28 """Extract module information from odoc JSON content.""" 29 - data = json.loads(json_content) 29 + try: 30 + data = json.loads(json_content) 31 + except json.JSONDecodeError as e: 32 + print(f"JSON decode error: {e}") 33 + # Return a minimal structure that won't cause errors downstream 34 + return { 35 + "name": "Unknown", 36 + "type": "Module", 37 + "breadcrumbs": [], 38 + "content": BeautifulSoup("", "html.parser"), 39 + "preamble": "" 40 + } 30 41 31 42 # Extract module name and type from header 32 43 header = data.get("header", "") ··· 328 339 return "\n".join(md_lines) 329 340 330 341 342 + def read_json_file(file_path): 343 + """ 344 + Read a JSON file with robust error handling for encoding issues. 345 + 346 + Args: 347 + file_path: Path to the JSON file 348 + 349 + Returns: 350 + Content of the JSON file as a string, or None if there was an error 351 + """ 352 + # Try UTF-8 first (most common encoding) 353 + try: 354 + with open(file_path, 'r', encoding='utf-8') as f: 355 + return f.read() 356 + except UnicodeDecodeError: 357 + # Try other encodings if UTF-8 fails 358 + try: 359 + with open(file_path, 'r', encoding='latin-1') as f: 360 + return f.read() 361 + except Exception as e: 362 + print(f"Error reading {file_path}: {str(e)}", file=sys.stderr) 363 + return None 364 + 365 + 331 366 def build_module_hierarchy(json_files, root_dir): 332 367 """Build a hierarchical structure from all the JSON files.""" 333 368 hierarchy = defaultdict(list) ··· 340 375 if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]: 341 376 # For index.html.json, check if it's a module documentation 342 377 if package_parts[-1] == "index.html.json" and len(package_parts) > 1: 343 - try: 344 - with open(json_file, 'r', encoding='utf-8') as f: 345 - json_content = f.read() 346 - 347 - # Try to parse the module info 348 - module_info = extract_module_info(json_content) 349 - signatures = parse_module_signature(module_info["content"]) 350 - 351 - # Group by package/library 352 - if len(package_parts) > 1: 353 - package_name = package_parts[0] 354 - hierarchy[package_name].append({ 378 + json_content = read_json_file(json_file) 379 + if json_content: 380 + try: 381 + # Try to parse the module info 382 + module_info = extract_module_info(json_content) 383 + signatures = parse_module_signature(module_info["content"]) 384 + 385 + # Determine package name and version from path 386 + package_name, package_version = determine_package_info(json_file, package_parts, module_info) 387 + 388 + # Use package name and version for the hierarchy key 389 + package_key = f"{package_name}" 390 + if package_version != "unknown": 391 + # Add version information to module_info for display in markdown 392 + module_info["package_version"] = package_version 393 + 394 + hierarchy[package_key].append({ 355 395 "file": json_file, 356 396 "module_info": module_info, 357 397 "signatures": signatures, 358 398 "path_parts": package_parts 359 399 }) 360 - except Exception as e: 361 - print(f"Error processing {json_file}: {e}", file=sys.stderr) 400 + except Exception as e: 401 + print(f"Error processing {json_file}: {e}", file=sys.stderr) 362 402 363 403 continue 364 404 365 405 # Try to parse other JSON files (non-index.html.json) 366 - try: 367 - with open(json_file, 'r', encoding='utf-8') as f: 368 - json_content = f.read() 369 - 370 - module_info = extract_module_info(json_content) 371 - signatures = parse_module_signature(module_info["content"]) 372 - 373 - # Group by package/library 374 - if len(package_parts) > 1: 375 - package_name = package_parts[0] 406 + json_content = read_json_file(json_file) 407 + if json_content: 408 + try: 409 + module_info = extract_module_info(json_content) 410 + signatures = parse_module_signature(module_info["content"]) 411 + 412 + # Determine package name from path 413 + package_name = determine_package_name(package_parts, module_info) 414 + 376 415 hierarchy[package_name].append({ 377 416 "file": json_file, 378 417 "module_info": module_info, 379 418 "signatures": signatures, 380 419 "path_parts": package_parts 381 420 }) 382 - except Exception as e: 383 - print(f"Error processing {json_file}: {e}", file=sys.stderr) 421 + except Exception as e: 422 + print(f"Error processing {json_file}: {e}", file=sys.stderr) 384 423 385 424 return hierarchy 386 425 387 426 427 + def determine_package_info(file_path, path_parts, module_info): 428 + """ 429 + Determine package name and version from file path and module info. 430 + 431 + Args: 432 + file_path: The full file path 433 + path_parts: Parts of the path 434 + module_info: Extracted module information 435 + 436 + Returns: 437 + Tuple of (package_name, package_version) 438 + """ 439 + package_name = "unknown" 440 + package_version = "unknown" 441 + 442 + # Try to extract from breadcrumbs if available 443 + if module_info["breadcrumbs"] and any("Library" in crumb for crumb in module_info["breadcrumbs"]): 444 + for crumb in module_info["breadcrumbs"]: 445 + if "Library" in crumb: 446 + # Extract library name from the breadcrumb 447 + match = re.search(r'Library\s+(.+)', crumb) 448 + if match: 449 + package_name = match.group(1).strip() 450 + 451 + # Look for test/package-name/version pattern in the path 452 + file_path_parts = Path(file_path).resolve().parts 453 + for i, part in enumerate(file_path_parts): 454 + if part == "test" and i + 2 < len(file_path_parts): 455 + # We found a test directory, extract package name and version 456 + package_name = file_path_parts[i + 1] 457 + package_version = file_path_parts[i + 2] 458 + break 459 + 460 + # If still unknown, fall back to using the first part of the path 461 + if package_name == "unknown" and len(path_parts) > 0: 462 + package_name = path_parts[0] 463 + 464 + # Last resort - use module name or "unknown" 465 + if package_name == "unknown": 466 + package_name = module_info["name"] if module_info["name"] else "unknown" 467 + 468 + return package_name, package_version 469 + 470 + 388 471 def sort_modules_hierarchically(modules): 389 472 """Sort modules to ensure proper hierarchical presentation.""" 390 473 # First sort by breadcrumb length (shorter = higher in hierarchy) ··· 414 497 415 498 416 499 def main(): 500 + """ 501 + Main entry point for the script. 502 + 503 + Usage examples: 504 + 505 + # Process all packages in a directory 506 + python odoc2llm.py /path/to/odoc/output 507 + 508 + # Process all packages and specify output file 509 + python odoc2llm.py /path/to/odoc/output --output documentation.md 510 + 511 + # Process a specific package only 512 + python odoc2llm.py /path/to/odoc/output --package package-name 513 + 514 + # Enable verbose output 515 + python odoc2llm.py /path/to/odoc/output --verbose 516 + """ 417 517 parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.') 418 518 parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files') 419 519 parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')

Configure Feed

Configure Feed