···2020import json
2121import re
2222import time
2323+import multiprocessing as mp
2324from bs4 import BeautifulSoup
2425from typing import Dict, List, Any, Optional, Tuple
2526import argparse
2627from pathlib import Path
2828+from functools import partial
272928302931def extract_package_info(path: str, mode: str = 'full',
···281283 return items
def worker_process_files(file_batch, package_name, package_version):
    """
    Extract items from every file in one batch.

    Intended to run inside a worker process: each file in the batch is
    handed to process_json_file and the per-file results are flattened
    into a single list, preserving the order of the batch.

    Args:
        file_batch: Iterable of file paths to process
        package_name: Name of the package the files belong to
        package_version: Version of the package the files belong to

    Returns:
        Flat list of the items extracted from all files in the batch
    """
    return [
        item
        for path in file_batch
        for item in process_json_file(path, package_name, package_version)
    ]
def collect_json_files(directory):
    """
    Recursively gather the odoc JSON files below a directory.

    Only files whose name ends with '.html.json' are kept. Paths are
    returned in the order os.walk visits them.

    Args:
        directory: Path to the directory to search

    Returns:
        List of file paths (each one is the walked directory joined with
        the file name)
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith('.html.json')
    ]
def process_directory(directory: str, mode: str = 'full',
                      override_package_name: Optional[str] = None,
                      override_package_version: Optional[str] = None,
                      num_workers: int = 1) -> List[Dict[str, Any]]:
    """
    Process all JSON files in a directory recursively, optionally in parallel.

    The files are collected once, split into contiguous batches (one per
    worker) and each batch is handed to worker_process_files. Results are
    concatenated in batch order, so the returned items follow the order in
    which the files were collected regardless of the number of workers.

    Args:
        directory: Path to the directory containing odoc JSON files
        mode: Operating mode - 'full' for full packages list, 'single' for a single package
        override_package_name: Optional override for package name
        override_package_version: Optional override for package version
        num_workers: Number of worker processes to use. Values below 1 are
            treated as 1, and the count is capped at the number of files so
            no idle processes are spawned.

    Returns:
        List of all extracted items from all files
    """
    package_name, package_version = extract_package_info(
        directory,
        mode=mode,
        override_package_name=override_package_name,
        override_package_version=override_package_version
    )

    # Collect all JSON files
    json_files = collect_json_files(directory)
    total_files = len(json_files)

    if total_files == 0:
        print(f"No .html.json files found in {directory}")
        return []

    # Clamp the worker count: a value of 0 used to raise ZeroDivisionError
    # when sizing batches, and more workers than files only wastes processes.
    num_workers = max(1, min(num_workers, total_files))

    mode_str = "single package mode" if mode == 'single' else "full packages mode"
    print(f"Processing {total_files} files from {package_name} {package_version} in {mode_str}...")
    print(f"Using {num_workers} worker processes")

    # Create partial function with fixed package name and version
    process_batch = partial(worker_process_files, package_name=package_name, package_version=package_version)

    start_time = time.time()

    if num_workers > 1:
        # Split into exactly num_workers contiguous batches whose sizes differ
        # by at most one, so every worker gets a single, similarly sized batch.
        # (Floor-division sizing produced extra leftover batches that forced a
        # second round of work on one process.)
        base_size, remainder = divmod(total_files, num_workers)
        batches = []
        start = 0
        for index in range(num_workers):
            end = start + base_size + (1 if index < remainder else 0)
            batches.append(json_files[start:end])
            start = end

        all_items: List[Dict[str, Any]] = []
        with mp.Pool(processes=num_workers) as pool:
            # pool.map blocks until every batch is done and returns the
            # results in the same order as the input batches.
            for batch_result in pool.map(process_batch, batches):
                all_items.extend(batch_result)
    else:
        # Single process mode: no pool, no batching overhead
        all_items = process_batch(json_files)

    elapsed_time = time.time() - start_time
    print(f"\nCompleted processing {total_files} files in {elapsed_time:.2f} seconds")
    print(f"Extracted {len(all_items)} items total")
    return all_items
344388345389···357401358402 # Process with explicit package name and version
359403 python odoc2json.py /path/to/odoc/package output.json --mode single --package-name package-name --package-version 5.0.0
404404+405405+ # Process with multiple cores
406406+ python odoc2json.py /path/to/odoc/output output.json --workers 8
360407 """
361408 parser = argparse.ArgumentParser(description='Convert odoc JSON to structured JSON records')
362409 parser.add_argument('input_dir', help='Directory containing odoc JSON output')
···367414 help='Run mode: "full" for complete list of packages, "single" for a single package')
368415 parser.add_argument('--package-name', help='Override the package name (useful in single mode)')
369416 parser.add_argument('--package-version', help='Override the package version (useful in single mode)')
417417+ parser.add_argument('--workers', type=int, default=mp.cpu_count(),
418418+ help=f'Number of worker processes (default: {mp.cpu_count()})')
370419 args = parser.parse_args()
371420372421 start_time = time.time()
373422 print(f"Starting extraction from {args.input_dir} in {args.mode} mode")
374423375375- # Process all files in the directory
424424+ # Process all files in the directory with multiple workers
376425 items = process_directory(
377426 args.input_dir,
378427 mode=args.mode,
379428 override_package_name=args.package_name,
380380- override_package_version=args.package_version
429429+ override_package_version=args.package_version,
430430+ num_workers=args.workers
381431 )
382432383433 # Write the output
# Entry-point guard. Besides keeping imports of this module side-effect free,
# it is needed by multiprocessing: on platforms where worker processes are
# started by re-importing the main module, an unguarded main() call would be
# re-executed in every worker.
if __name__ == "__main__":
    main()