parallelise · anil.recoil.org/odoc-mcp@94ea53a

+84 -34

1 changed file

expand all

odoc2json.py

+84 -34

odoc2json.py

··· 20 20 import json 21 21 import re 22 22 import time 23 + import multiprocessing as mp 23 24 from bs4 import BeautifulSoup 24 25 from typing import Dict, List, Any, Optional, Tuple 25 26 import argparse 26 27 from pathlib import Path 28 + from functools import partial 27 29 28 30 29 31 def extract_package_info(path: str, mode: str = 'full', ··· 281 283 return items 282 284 283 285 286 + def worker_process_files(file_batch, package_name, package_version): 287 + """ 288 + Worker function to process a batch of files in parallel. 289 + 290 + Args: 291 + file_batch: List of files to process 292 + package_name: Name of the package 293 + package_version: Version of the package 294 + 295 + Returns: 296 + List of all extracted items from all files in the batch 297 + """ 298 + batch_items = [] 299 + for file_path in file_batch: 300 + items = process_json_file(file_path, package_name, package_version) 301 + batch_items.extend(items) 302 + return batch_items 303 + 304 + 305 + def collect_json_files(directory): 306 + """ 307 + Collect all JSON files in a directory recursively. 308 + 309 + Args: 310 + directory: Path to the directory to search 311 + 312 + Returns: 313 + List of file paths 314 + """ 315 + json_files = [] 316 + for root, _, files in os.walk(directory): 317 + for file in files: 318 + if file.endswith('.html.json'): 319 + json_files.append(os.path.join(root, file)) 320 + return json_files 321 + 322 + 284 323 def process_directory(directory: str, mode: str = 'full', 285 324 override_package_name: Optional[str] = None, 286 - override_package_version: Optional[str] = None) -> List[Dict[str, Any]]: 325 + override_package_version: Optional[str] = None, 326 + num_workers: int = 1) -> List[Dict[str, Any]]: 287 327 """ 288 - Process all JSON files in a directory recursively. 328 + Process all JSON files in a directory recursively using multiple processes. 
289 329 290 330 Args: 291 331 directory: Path to the directory containing odoc JSON files 292 332 mode: Operating mode - 'full' for full packages list, 'single' for a single package 293 333 override_package_name: Optional override for package name 294 334 override_package_version: Optional override for package version 335 + num_workers: Number of worker processes to use 295 336 296 337 Returns: 297 338 List of all extracted items from all files 298 339 """ 299 - all_items = [] 300 340 package_name, package_version = extract_package_info( 301 341 directory, 302 342 mode=mode, ··· 304 344 override_package_version=override_package_version 305 345 ) 306 346 307 - # First count total files to process for progress tracking 308 - total_files = 0 309 - for root, _, files in os.walk(directory): 310 - for file in files: 311 - if file.endswith('.html.json'): 312 - total_files += 1 347 + # Collect all JSON files 348 + json_files = collect_json_files(directory) 349 + total_files = len(json_files) 313 350 314 351 if total_files == 0: 315 352 print(f"No .html.json files found in {directory}") 316 - return all_items 353 + return [] 317 354 318 - mode_str = f"single package mode" if mode == 'single' else "full packages mode" 355 + mode_str = "single package mode" if mode == 'single' else "full packages mode" 319 356 print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...") 357 + print(f"Using {num_workers} worker processes") 320 358 321 - # Process each file with progress indicator 322 - processed_files = 0 323 - extracted_items = 0 359 + # Split files into batches for workers 360 + batches = [] 361 + batch_size = max(1, total_files // num_workers) 362 + for i in range(0, total_files, batch_size): 363 + batches.append(json_files[i:i + batch_size]) 324 364 325 - for root, _, files in os.walk(directory): 326 - for file in files: 327 - if file.endswith('.html.json'): 328 - file_path = os.path.join(root, file) 329 - items = 
process_json_file(file_path, package_name, package_version) 330 - all_items.extend(items) 331 - 332 - # Update progress 333 - processed_files += 1 334 - extracted_items += len(items) 335 - 336 - # Print progress every 100 files or on the last file 337 - if processed_files % 100 == 0 or processed_files == total_files: 338 - percent = (processed_files / total_files) * 100 339 - print(f"Progress: {processed_files}/{total_files} files ({percent:.1f}%) - {extracted_items} items extracted", 340 - end="\r", flush=True) 365 + # Create partial function with fixed package name and version 366 + process_batch = partial(worker_process_files, package_name=package_name, package_version=package_version) 341 367 342 - print(f"\nCompleted processing {processed_files} files - extracted {extracted_items} items total.") 368 + # Process batches in parallel 369 + start_time = time.time() 370 + all_items = [] 371 + 372 + if num_workers > 1: 373 + # Use multiprocessing Pool 374 + with mp.Pool(processes=num_workers) as pool: 375 + # Submit all batches to the pool 376 + results = pool.map(process_batch, batches) 377 + # Collect all results 378 + for batch_result in results: 379 + all_items.extend(batch_result) 380 + else: 381 + # Single process mode 382 + all_items = process_batch(json_files) 383 + 384 + elapsed_time = time.time() - start_time 385 + print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds") 386 + print(f"Extracted {len(all_items)} items total") 343 387 return all_items 344 388 345 389 ··· 357 401 358 402 # Process with explicit package name and version 359 403 python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0 404 + 405 + # Process with multiple cores 406 + python odoc2json.py /path/to/odoc/output output.json --workers 8 360 407 """ 361 408 parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records') 362 409 parser.add_argument('input_dir', 
help='Directory containing odoc JSON output') ··· 367 414 help='Run mode: "full" for complete list of packages, "single" for a single package') 368 415 parser.add_argument('--package-name', help='Override the package name (useful in single mode)') 369 416 parser.add_argument('--package-version', help='Override the package version (useful in single mode)') 417 + parser.add_argument('--workers', type=int, default=mp.cpu_count(), 418 + help=f'Number of worker processes (default: {mp.cpu_count()})') 370 419 args = parser.parse_args() 371 420 372 421 start_time = time.time() 373 422 print(f"Starting extraction from {args.input_dir} in {args.mode} mode") 374 423 375 - # Process all files in the directory 424 + # Process all files in the directory with multiple workers 376 425 items = process_directory( 377 426 args.input_dir, 378 427 mode=args.mode, 379 428 override_package_name=args.package_name, 380 - override_package_version=args.package_version 429 + override_package_version=args.package_version, 430 + num_workers=args.workers 381 431 ) 382 432 383 433 # Write the output ··· 394 444 395 445 396 446 if __name__ == "__main__": 397 - main() 447 + main()

Configure Feed

Configure Feed