···1919import os
2020import json
2121import re
2222+import time
2223from bs4 import BeautifulSoup
2324from typing import Dict, List, Any, Optional, Tuple
2425import argparse
2526from pathlib import Path
def _list_subdirectories(path: str) -> List[str]:
    """Return the names of the immediate subdirectories of *path*, sorted.

    Sorting makes the result independent of the arbitrary order in which
    ``os.listdir`` reports entries. Raises ``OSError`` if *path* cannot be listed.
    """
    return sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )


def extract_package_info(path: str, mode: str = 'full',
                         override_package_name: Optional[str] = None,
                         override_package_version: Optional[str] = None) -> Tuple[str, str]:
    """
    Extract package name and version from the path.

    Explicit overrides always win; only the fields that have no override are
    inferred from the directory layout. Fields that cannot be inferred are
    reported as "unknown".

    Args:
        path: Path to the odoc output directory
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version

    Returns:
        Tuple of (package_name, package_version)
    """
    package_name = override_package_name or "unknown"
    package_version = override_package_version or "unknown"

    # If we have both overrides, no need to analyze path
    if override_package_name and override_package_version:
        return package_name, package_version

    # Use Path for more reliable path parsing
    parts = Path(path).resolve().parts

    if mode == 'single':
        # In single package mode, the package name is typically the directory name
        if not override_package_name and parts:
            package_name = parts[-1]

            # A subdirectory, when present, is preferred as the package name.
            # An unreadable or missing directory keeps the name derived from
            # the path instead of aborting the whole run.
            try:
                subdirs = _list_subdirectories(path)
            except OSError as e:
                print(f"Error accessing directory {path}: {str(e)}")
            else:
                if subdirs:
                    package_name = subdirs[0]

    elif mode == 'full':
        # In full mode the expected layout is <path>/package-name/package-version/
        try:
            subdirs = _list_subdirectories(path)

            if not override_package_name:
                # A subdirectory that itself contains directories looks like
                # package-name/version/, so treat it as the package.
                for subdir in subdirs:
                    if _list_subdirectories(os.path.join(path, subdir)):
                        # The version is refined per file elsewhere; honour an
                        # explicit version override rather than discarding it.
                        return subdir, package_version

            # No package structure below `path`: assume `path` itself is
            # .../package-name/package-version
            if len(parts) >= 3:
                if not override_package_name:
                    package_name = parts[-2]
                if not override_package_version:
                    package_version = parts[-1]
        except OSError as e:
            # Handle cases where we can't access the directory
            print(f"Error accessing directory {path}: {str(e)}")

    return package_name, package_version
521145311554116def parse_html_content(content: str) -> List[Dict[str, Any]]:
···154216 Returns:
155217 List of dictionaries containing extracted information
156218 """
157157- with open(file_path, 'r', encoding='utf-8') as f:
219219+ # Extract package and version from file path if not already properly set
220220+ if package_version == "unknown" or package_name == "unknown":
221221+ # Check if this file is in a test directory structure
222222+ file_path_parts = Path(file_path).resolve().parts
223223+224224+ # Look for test/package-name/version pattern in the path
225225+ for i, part in enumerate(file_path_parts):
226226+ if part == "test" and i + 2 < len(file_path_parts):
227227+ # We found a test directory, extract package name and version
228228+ package_name = file_path_parts[i + 1]
229229+ package_version = file_path_parts[i + 2]
230230+ break
231231+232232+ try:
233233+ with open(file_path, 'r', encoding='utf-8') as f:
234234+ try:
235235+ data = json.load(f)
236236+ except json.JSONDecodeError:
237237+ print(f"Error decoding JSON from {file_path}")
238238+ return []
239239+ except UnicodeDecodeError:
240240+ # Try opening with a different encoding or with errors='ignore'
158241 try:
159159- data = json.load(f)
160160- except json.JSONDecodeError:
161161- print(f"Error decoding JSON from {file_path}")
242242+ with open(file_path, 'r', encoding='latin-1') as f:
243243+ try:
244244+ data = json.load(f)
245245+ except json.JSONDecodeError:
246246+ print(f"Error decoding JSON from {file_path} with latin-1 encoding")
247247+ return []
248248+ except Exception as e:
249249+ print(f"Error reading {file_path}: {str(e)}")
162250 return []
163251164252 if 'content' not in data:
···193281 return items
def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively.

    The tree is walked once to collect every ``.html.json`` file; that same
    list drives both the progress total and the processing loop, so the two
    can never disagree.

    Args:
        directory: Path to the directory containing odoc JSON files
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version

    Returns:
        List of all extracted items from all files
    """
    all_items = []
    package_name, package_version = extract_package_info(
        directory,
        mode=mode,
        override_package_name=override_package_name,
        override_package_version=override_package_version
    )

    # Collect the files up front so the total is known for progress tracking
    json_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.html.json'):
                json_paths.append(os.path.join(root, file))

    total_files = len(json_paths)
    if total_files == 0:
        print(f"No .html.json files found in {directory}")
        return all_items

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")

    extracted_items = 0
    for processed_files, file_path in enumerate(json_paths, start=1):
        items = process_json_file(file_path, package_name, package_version)
        all_items.extend(items)
        extracted_items += len(items)

        # Print progress every 100 files or on the last file; "\r" rewrites
        # the same terminal line instead of scrolling.
        if processed_files % 100 == 0 or processed_files == total_files:
            percent = (processed_files / total_files) * 100
            print(f"Progress: {processed_files}/{total_files} files ({percent:.1f}%) - {extracted_items} items extracted",
                  end="\r", flush=True)

    print(f"\nCompleted processing {total_files} files - extracted {extracted_items} items total.")
    return all_items
def main():
    """
    Command-line entry point: convert an odoc JSON tree into one JSON file.

    Usage examples:

        # Process in full mode (multiple packages)
        python odoc2json.py /path/to/odoc/output output.json

        # Process a single package with automatic detection
        python odoc2json.py /path/to/odoc/package output.json --mode single

        # Process with explicit package name and version
        python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
    """
    parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
    parser.add_argument('input_dir', help='Directory containing odoc JSON output')
    parser.add_argument('output_file', help='Output JSON file path')
    parser.add_argument('--pretty', action='store_true', help='Pretty-print the JSON output')
    # NOTE(review): --verbose is accepted but nothing in this function reads it.
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')
    parser.add_argument('--mode', choices=['full', 'single'], default='full',
                        help='Run mode: "full" for complete list of packages, "single" for a single package')
    parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
    parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
    args = parser.parse_args()

    started_at = time.time()
    print(f"Starting extraction from {args.input_dir} in {args.mode} mode")

    # Walk the input tree and gather every extracted record
    items = process_directory(
        args.input_dir,
        mode=args.mode,
        override_package_name=args.package_name,
        override_package_version=args.package_version
    )

    # Serialise the records; indent=None is json.dump's compact default
    print(f"Writing {len(items)} items to {args.output_file}...")
    indent = 2 if args.pretty else None
    with open(args.output_file, 'w', encoding='utf-8') as out:
        json.dump(items, out, indent=indent, ensure_ascii=False)

    elapsed_time = time.time() - started_at
    print(f"Processed {len(items)} items in {elapsed_time:.2f} seconds")
    print(f"Output saved to {args.output_file}")
237394238395239396if __name__ == "__main__":
+127-27
odoc2llm.py
···26262727def extract_module_info(json_content):
2828 """Extract module information from odoc JSON content."""
2929- data = json.loads(json_content)
2929+ try:
3030+ data = json.loads(json_content)
3131+ except json.JSONDecodeError as e:
3232+ print(f"JSON decode error: {e}")
3333+ # Return a minimal structure that won't cause errors downstream
3434+ return {
3535+ "name": "Unknown",
3636+ "type": "Module",
3737+ "breadcrumbs": [],
3838+ "content": BeautifulSoup("", "html.parser"),
3939+ "preamble": ""
4040+ }
30413142 # Extract module name and type from header
3243 header = data.get("header", "")
···328339 return "\n".join(md_lines)
def read_json_file(file_path):
    """
    Read a JSON file with robust error handling for encoding and I/O issues.

    The file is decoded as UTF-8 first. If the bytes are not valid UTF-8 it is
    re-read as latin-1, which accepts any byte sequence. Any failure to open
    or read the file is reported on stderr instead of being raised.

    Args:
        file_path: Path to the JSON file

    Returns:
        Content of the JSON file as a string, or None if there was an error
    """
    # Try UTF-8 first (most common encoding)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        pass  # not UTF-8: fall through to the latin-1 attempt below
    except OSError as e:
        # Missing/unreadable file: honour the documented "None on error" contract
        print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
        return None

    try:
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()
    except OSError as e:
        print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
        return None
364364+365365+331366def build_module_hierarchy(json_files, root_dir):
332367 """Build a hierarchical structure from all the JSON files."""
333368 hierarchy = defaultdict(list)
···340375 if package_parts[-1] in ["index.html.json", "sidebar.json", "status.json", "sherlodoc_db.js"]:
341376 # For index.html.json, check if it's a module documentation
342377 if package_parts[-1] == "index.html.json" and len(package_parts) > 1:
343343- try:
344344- with open(json_file, 'r', encoding='utf-8') as f:
345345- json_content = f.read()
346346-347347- # Try to parse the module info
348348- module_info = extract_module_info(json_content)
349349- signatures = parse_module_signature(module_info["content"])
350350-351351- # Group by package/library
352352- if len(package_parts) > 1:
353353- package_name = package_parts[0]
354354- hierarchy[package_name].append({
378378+ json_content = read_json_file(json_file)
379379+ if json_content:
380380+ try:
381381+ # Try to parse the module info
382382+ module_info = extract_module_info(json_content)
383383+ signatures = parse_module_signature(module_info["content"])
384384+385385+ # Determine package name and version from path
386386+ package_name, package_version = determine_package_info(json_file, package_parts, module_info)
387387+388388+ # Use package name and version for the hierarchy key
389389+ package_key = f"{package_name}"
390390+ if package_version != "unknown":
391391+ # Add version information to module_info for display in markdown
392392+ module_info["package_version"] = package_version
393393+394394+ hierarchy[package_key].append({
355395 "file": json_file,
356396 "module_info": module_info,
357397 "signatures": signatures,
358398 "path_parts": package_parts
359399 })
360360- except Exception as e:
361361- print(f"Error processing {json_file}: {e}", file=sys.stderr)
400400+ except Exception as e:
401401+ print(f"Error processing {json_file}: {e}", file=sys.stderr)
362402363403 continue
364404365405 # Try to parse other JSON files (non-index.html.json)
366366- try:
367367- with open(json_file, 'r', encoding='utf-8') as f:
368368- json_content = f.read()
369369-370370- module_info = extract_module_info(json_content)
371371- signatures = parse_module_signature(module_info["content"])
372372-373373- # Group by package/library
374374- if len(package_parts) > 1:
375375- package_name = package_parts[0]
406406+ json_content = read_json_file(json_file)
407407+ if json_content:
408408+ try:
409409+ module_info = extract_module_info(json_content)
410410+ signatures = parse_module_signature(module_info["content"])
411411+412412+ # Determine package name from path
413413+ package_name = determine_package_name(package_parts, module_info)
414414+376415 hierarchy[package_name].append({
377416 "file": json_file,
378417 "module_info": module_info,
379418 "signatures": signatures,
380419 "path_parts": package_parts
381420 })
382382- except Exception as e:
383383- print(f"Error processing {json_file}: {e}", file=sys.stderr)
421421+ except Exception as e:
422422+ print(f"Error processing {json_file}: {e}", file=sys.stderr)
384423385424 return hierarchy
def determine_package_info(file_path, path_parts, module_info):
    """
    Determine package name and version from file path and module info.

    Sources are consulted in this order, later ones overriding earlier ones
    only where noted:

    1. A breadcrumb of the form "Library <name>" supplies the package name
       (the last such breadcrumb wins).
    2. A ``test/<package-name>/<version>/`` directory sequence in the resolved
       file path supplies both name and version, overriding step 1.
    3. If the name is still unknown, the first element of ``path_parts``.
    4. If the name is still unknown, the module's own name.

    Args:
        file_path: The full file path
        path_parts: Parts of the path
        module_info: Extracted module information (reads "breadcrumbs" and "name")

    Returns:
        Tuple of (package_name, package_version); either may be "unknown"
    """
    package_name = "unknown"
    package_version = "unknown"

    # Try to extract from breadcrumbs if available
    for crumb in module_info.get("breadcrumbs") or []:
        match = re.search(r'Library\s+(.+)', crumb)
        if match:
            package_name = match.group(1).strip()

    # Look for test/package-name/version pattern in the path. Only directory
    # components are considered: the final component is the file itself and
    # must never be mistaken for a version.
    directory_parts = Path(file_path).resolve().parts[:-1]
    for i, part in enumerate(directory_parts):
        if part == "test" and i + 2 < len(directory_parts):
            package_name = directory_parts[i + 1]
            package_version = directory_parts[i + 2]
            break

    # If still unknown, fall back to using the first part of the path
    if package_name == "unknown" and path_parts:
        package_name = path_parts[0]

    # Last resort - use module name or "unknown"
    if package_name == "unknown":
        package_name = module_info.get("name") or "unknown"

    return package_name, package_version
469469+470470+388471def sort_modules_hierarchically(modules):
389472 """Sort modules to ensure proper hierarchical presentation."""
390473 # First sort by breadcrumb length (shorter = higher in hierarchy)
···414497415498416499def main():
500500+ """
501501+ Main entry point for the script.
502502+503503+ Usage examples:
504504+505505+ # Process all packages in a directory
506506+ python odoc2llm.py /path/to/odoc/output
507507+508508+ # Process all packages and specify output file
509509+ python odoc2llm.py /path/to/odoc/output --output documentation.md
510510+511511+ # Process a specific package only
512512+ python odoc2llm.py /path/to/odoc/output --package package-name
513513+514514+ # Enable verbose output
515515+ python odoc2llm.py /path/to/odoc/output --verbose
516516+ """
417517 parser = argparse.ArgumentParser(description='Convert odoc JSON to Markdown for LLMs.')
418518 parser.add_argument('html_dir', help='Directory containing odoc generated HTML/JSON files')
419519 parser.add_argument('--output', '-o', default='odoc_for_llm.md', help='Output Markdown file')