My aggregated monorepo of OCaml code, automaintained
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

Fix DAG executor race condition, add cascade logging, and deduplicate layer code

- Fix Os.mkdir EEXIST race when concurrent forked workers create the same
package directory (root cause of ~370 cascade failures in roll-forward)
- Add on_cascade callback to execute_dag for cascade root-cause logging
and skeleton layer creation (layer.json + opam-repository for rerun)
- Extract shared helpers: Util.populate_opam_repository,
Util.write_skeleton_layer, Util.wait_for_layer_json
- Refactor execute_dag internals: deduplicate propagate_failure,
promote_dependents, and complete_node into single definitions
- Use Unix._exit in forked children to avoid flushing parent buffers
- Replace Str with thread-safe pure-string substring matching
- Add Unix.lockf file locking to History.append for concurrent writes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+345 -407
+4 -4
day10/bin/dune
··· 1 - (executable 2 - (public_name day10) 3 - (name main) 1 + (executables 2 + (public_names day10 -) 3 + (names main test_incr_solver) 4 4 (enabled_if (>= %{ocaml_version} 5.3.0)) 5 5 (package day10) 6 - (libraries opam-0install yojson ppx_deriving_yojson.runtime cmdliner dockerfile day10_lib str) 6 + (libraries opam-0install yojson ppx_deriving_yojson.runtime cmdliner dockerfile day10_lib str git-unix) 7 7 (preprocess 8 8 (pps ppx_deriving_yojson)))
+266 -387
day10/bin/main.ml
··· 304 304 Day10_lib.History.append ~packages_dir ~pkg_str entry 305 305 end 306 306 307 - (** Check if any pattern in the list matches the given text (case-insensitive). *) 307 + (** Case-insensitive substring search. Thread-safe (no global state). *) 308 + let contains_substring_ci ~pattern text = 309 + let pat = String.lowercase_ascii pattern in 310 + let pat_len = String.length pat in 311 + let text_len = String.length text in 312 + if pat_len > text_len then false 313 + else 314 + let rec check i = 315 + if i > text_len - pat_len then false 316 + else if String.lowercase_ascii (String.sub text i pat_len) = pat then true 317 + else check (i + 1) 318 + in 319 + check 0 320 + 321 + (** Check if any substring in the list appears in the text (case-insensitive). *) 308 322 let matches_any patterns text = 309 - List.exists (fun pat -> 310 - try ignore (Str.search_forward (Str.regexp_case_fold pat) text 0); true 311 - with Not_found -> false 312 - ) patterns 323 + List.exists (fun pat -> contains_substring_ci ~pattern:pat text) patterns 313 324 314 325 (** Extract the compiler version from a layer.json's deps list. 315 326 Looks for packages starting with "ocaml-base-compiler" or "ocaml-variants". *) ··· 341 352 ] in 342 353 let depext_patterns = [ 343 354 "Unable to locate package"; 344 - "Package .* is not available"; 355 + "is not available"; 345 356 "unmet dependencies"; 346 357 "dpkg: dependency problems"; 347 358 ] in ··· 372 383 let build_log = Path.(temp_dir / "build.log") in 373 384 set_temp_log_path build_log; 374 385 let opam_repo = Util.create_opam_repository temp_dir in 375 - let () = 376 - List.iter 377 - (fun pkg -> 378 - let opam_relative_path = Path.("packages" / OpamPackage.name_to_string pkg / OpamPackage.to_string pkg) in 379 - List.find_map 380 - (fun opam_repository -> 381 - let opam = Path.(opam_repository / opam_relative_path) in 382 - if Sys.file_exists opam then Some opam else None) 383 - config.opam_repositories 384 - |> Option.iter (fun src -> 385 - let dst = Path.(opam_repo / opam_relative_path) in 386 - let () = Os.mkdir ~parents:true dst in 387 - let () = Os.cp Path.(src / "opam") Path.(dst / "opam") in 388 - let src_files = Path.(src / "files") in 389 - if Sys.file_exists src_files then 390 - let dst_files = Path.(dst / "files") in 391 - let () = Os.mkdir dst_files in 392 - Sys.readdir src_files |> Array.iter (fun f -> Os.cp Path.(src_files / f) Path.(dst_files / f)))) 393 - (pkg :: ordered_deps) 394 - in 386 + Util.populate_opam_repository ~opam_repo ~opam_repositories:config.opam_repositories (pkg :: ordered_deps); 395 387 let r = Container.build ~t ~temp_dir build_log pkg ordered_build_hashes in 396 388 let () = Os.safe_rename_dir ~marker_file:layer_json temp_dir target_dir in 397 389 (* Scan for files installed by this package (the upperdir contains only new files) *) ··· 423 415 Os.create_directory_exclusively ~marker_file:layer_json ~lock_info layer_dir safe_write_layer 424 416 in 425 417 let () = if config.log then Os.read_from_file Path.(layer_dir / "build.log") |> print_endline in 426 - (* Wait for layer.json to exist (might be created by another parallel worker) *) 427 - let rec wait_for_layer_json retries = 428 - if Sys.file_exists layer_json then () 429 - else if retries <= 0 then 430 - failwith (Printf.sprintf "Build layer %s never completed (layer.json missing)" build_layer_name) 431 - else begin 432 - Unix.sleepf 0.5; 433 - wait_for_layer_json (retries - 1) 434 - end 435 - in 436 - let () = wait_for_layer_json 600 in (* Wait up to 5 minutes *) 437 - let () = Unix.utimes layer_json 0.0 0.0 in 418 + Util.wait_for_layer_json ~layer_json ~layer_name:build_layer_name; 438 419 (* Ensure symlink exists even if layer was pre-existing from previous run *) 439 420 Util.ensure_package_layer_symlink ~cache_dir:config.dir ~os_key:(Config.os_key ~config) ~pkg_str ~layer_name:build_layer_name; 440 421 (* Create blessed-build symlink if this package is blessed *) ··· 514 495 if not (Sys.file_exists doc_layer_json) then 515 496 Os.create_directory_exclusively ~marker_file:doc_layer_json ~lock_info doc_layer_dir safe_write_layer 516 497 in 517 - (* Wait for layer.json to exist (might be created by another parallel worker) *) 518 - let rec wait_for_layer_json retries = 519 - if Sys.file_exists doc_layer_json then () 520 - else if retries <= 0 then 521 - failwith (Printf.sprintf "Doc layer %s never completed (layer.json missing)" doc_layer_name) 522 - else begin 523 - Unix.sleepf 0.5; 524 - wait_for_layer_json (retries - 1) 525 - end 526 - in 527 - let () = wait_for_layer_json 600 in (* Wait up to 5 minutes *) 528 - let () = Unix.utimes doc_layer_json 0.0 0.0 in 498 + Util.wait_for_layer_json ~layer_json:doc_layer_json ~layer_name:doc_layer_name; 529 499 (* Check if doc generation failed *) 530 500 if Util.load_layer_info_doc_failed doc_layer_json then 531 501 None ··· 584 554 if not (Sys.file_exists jtw_layer_json) then 585 555 Os.create_directory_exclusively ~marker_file:jtw_layer_json ~lock_info jtw_layer_dir safe_write_layer 586 556 in 587 - (* Wait for layer.json *) 588 - let rec wait_for_layer_json retries = 589 - if Sys.file_exists jtw_layer_json then () 590 - else if retries <= 0 then 591 - failwith (Printf.sprintf "JTW layer %s never completed (layer.json missing)" jtw_layer_name) 592 - else begin 593 - Unix.sleepf 0.5; 594 - wait_for_layer_json (retries - 1) 595 - end 596 - in 597 - let () = wait_for_layer_json 600 in 598 - let () = Unix.utimes jtw_layer_json 0.0 0.0 in 557 + Util.wait_for_layer_json ~layer_json:jtw_layer_json ~layer_name:jtw_layer_name; 599 558 (* Create symlink *) 600 559 Util.ensure_package_layer_symlink ~cache_dir:config.dir ~os_key ~pkg_str ~layer_name:jtw_layer_name; 601 560 Some jtw_layer_name 602 561 562 + (* Global cache for Container.layer_hash results, shared between build and summary phases. 563 + Key: comma-joined package names, Value: hash string. *) 564 + let layer_hash_global_cache : (string, string) Hashtbl.t = Hashtbl.create 4096 565 + 566 + let cached_layer_hash_global ~t pkgs = 567 + let key = String.concat "," (List.map OpamPackage.to_string pkgs) in 568 + match Hashtbl.find_opt layer_hash_global_cache key with 569 + | Some h -> h 570 + | None -> 571 + let h = Container.layer_hash ~t pkgs in 572 + Hashtbl.replace layer_hash_global_cache key h; 573 + h 574 + 603 575 let build config package = 604 576 match solve config package with 605 577 | Ok solution -> ··· 617 589 | [] -> true 618 590 | pkg :: rest -> 619 591 let ordered_deps = extract_dag dependencies pkg |> topological_sort |> List.rev |> List.tl in 620 - let hash = Container.layer_hash ~t (pkg :: ordered_deps) in 592 + let hash = cached_layer_hash_global ~t (pkg :: ordered_deps) in 621 593 let build_layer_name = "build-" ^ hash in 622 594 let layer_dir = Path.(config.dir / Config.os_key ~config / build_layer_name) in 623 595 let layer_json = Path.(layer_dir / "layer.json") in ··· 660 632 | _ -> None) 661 633 ordered_deps 662 634 in 663 - let hash = Container.layer_hash ~t (pkg :: ordered_deps) in 635 + let hash = cached_layer_hash_global ~t (pkg :: ordered_deps) in 664 636 let build_layer_name = "build-" ^ hash in 665 637 let do_build () = 666 638 let r = build_layer t pkg build_layer_name ordered_deps ordered_build_hashes in ··· 712 684 let r, dm = do_build () in 713 685 (r :: res, OpamPackage.Map.add pkg r bm, dm) 714 686 | _ -> 687 + (* Dep failed — create skeleton layer so cascade can rebuild without re-solving *) 688 + let config = Container.config ~t in 689 + let os_key = Config.os_key ~config in 690 + Util.write_skeleton_layer ~cache_dir:config.dir ~os_key 691 + ~opam_repositories:config.opam_repositories ~layer_name:build_layer_name 692 + ~pkg ~ordered_deps ~dep_build_hashes:ordered_build_hashes; 715 693 (Dependency_failed :: res, OpamPackage.Map.add pkg Dependency_failed bm, dm)) 716 694 ([], OpamPackage.Map.empty, OpamPackage.Map.empty) ordered_installation 717 695 in ··· 1182 1160 let nodes : (string, build_node) Hashtbl.t = Hashtbl.create 1024 in 1183 1161 (* Track dep edges: build_hash -> set of dep build_hashes *) 1184 1162 let edges : (string, string list) Hashtbl.t = Hashtbl.create 1024 in 1185 - (* Track ordering within each solution *) 1186 - let all_ordered : (string * string list) list ref = ref [] in 1187 - List.iter (fun (_target, solution) -> 1163 + let per_solution_hashes = List.map (fun (_target, solution) -> 1188 1164 let ordered = topological_sort solution in 1189 1165 let dependencies = pkg_deps solution ordered in 1190 1166 (* Map from pkg -> build_hash for this solution *) 1191 1167 let pkg_to_hash : (string, string) Hashtbl.t = Hashtbl.create 64 in 1192 1168 List.iter (fun pkg -> 1193 1169 let ordered_deps = extract_dag dependencies pkg |> topological_sort |> List.rev |> List.tl in 1194 - let hash = Container.layer_hash ~t (pkg :: ordered_deps) in 1170 + let hash = cached_layer_hash_global ~t (pkg :: ordered_deps) in 1195 1171 let build_hash = "build-" ^ hash in 1196 1172 let dep_build_hashes = List.filter_map (fun dep -> 1197 1173 Hashtbl.find_opt pkg_to_hash (OpamPackage.to_string dep) ··· 1199 1175 Hashtbl.replace pkg_to_hash (OpamPackage.to_string pkg) build_hash; 1200 1176 if not (Hashtbl.mem nodes build_hash) then begin 1201 1177 Hashtbl.replace nodes build_hash { pkg; build_hash; ordered_deps; dep_build_hashes }; 1202 - Hashtbl.replace edges build_hash dep_build_hashes; 1203 - all_ordered := (build_hash, dep_build_hashes) :: !all_ordered 1178 + Hashtbl.replace edges build_hash dep_build_hashes 1204 1179 end 1205 - ) ordered 1206 - ) solutions; 1180 + ) ordered; 1181 + (ordered, pkg_to_hash) 1182 + ) solutions in 1207 1183 Container.deinit ~t; 1208 1184 (* Topological sort of the global DAG by build_hash *) 1209 1185 let remaining : (string, int) Hashtbl.t = Hashtbl.create (Hashtbl.length nodes) in ··· 1234 1210 done; 1235 1211 (* result is in reverse topo order, reverse it *) 1236 1212 let ordered_hashes = List.rev !result in 1237 - List.filter_map (fun h -> Hashtbl.find_opt nodes h) ordered_hashes 1213 + (List.filter_map (fun h -> Hashtbl.find_opt nodes h) ordered_hashes, per_solution_hashes) 1238 1214 1239 1215 (** Execute build layers in dependency order with up to [np] concurrent runc containers. 1240 - Calls [build_one node] for each node; the function should return true on success. *) 1241 - let execute_dag ~np ~on_complete ~cache_dir ~os_key (nodes : build_node list) (build_one : build_node -> bool) = 1216 + Calls [build_one node] for each node; the function should return true on success. 1217 + [on_cascade] is called for each node that fails due to a dependency failure, 1218 + with the failing node and the build_hash of the dep that caused the cascade. *) 1219 + let execute_dag ~np ~on_complete ?on_cascade ~cache_dir ~os_key (nodes : build_node list) (build_one : build_node -> bool) = 1242 1220 (* Index: build_hash -> node *) 1243 1221 let node_of_hash : (string, build_node) Hashtbl.t = Hashtbl.create (List.length nodes) in 1244 1222 List.iter (fun n -> Hashtbl.replace node_of_hash n.build_hash n) nodes; ··· 1271 1249 let total = List.length nodes in 1272 1250 let completed_count = ref 0 in 1273 1251 let failed_count = ref 0 in 1252 + (* Mark a node as cascade-failed and notify callbacks *) 1253 + let cascade_fail ~failed_hash ~failed_dep_hash = 1254 + Hashtbl.replace completed failed_hash false; 1255 + incr completed_count; 1256 + incr failed_count; 1257 + on_complete ~total ~completed:!completed_count ~failed:!failed_count failed_hash false; 1258 + (match on_cascade with Some f -> f ~failed_hash ~failed_dep_hash | None -> ()) 1259 + in 1260 + (* Recursively propagate failure to all transitive dependents *) 1261 + let rec propagate_failure failed_dep_hash = 1262 + List.iter (fun rdep_hash -> 1263 + let c = Hashtbl.find remaining_deps rdep_hash - 1 in 1264 + Hashtbl.replace remaining_deps rdep_hash c; 1265 + if c = 0 then begin 1266 + cascade_fail ~failed_hash:rdep_hash ~failed_dep_hash; 1267 + propagate_failure rdep_hash 1268 + end 1269 + ) (try Hashtbl.find rdeps failed_dep_hash with Not_found -> []) 1270 + in 1271 + (* After a node completes, promote or cascade-fail its dependents *) 1272 + let promote_dependents hash = 1273 + List.iter (fun dep_hash -> 1274 + let count = Hashtbl.find remaining_deps dep_hash - 1 in 1275 + Hashtbl.replace remaining_deps dep_hash count; 1276 + if count = 0 then begin 1277 + let all_deps_ok = match Hashtbl.find_opt node_of_hash dep_hash with 1278 + | Some n -> List.for_all (fun dh -> 1279 + match Hashtbl.find_opt completed dh with 1280 + | Some true -> true 1281 + | _ -> not (Hashtbl.mem node_of_hash dh) (* not in DAG = pre-existing *) 1282 + ) n.dep_build_hashes 1283 + | None -> false 1284 + in 1285 + if all_deps_ok then 1286 + Queue.push (Hashtbl.find node_of_hash dep_hash) ready 1287 + else begin 1288 + cascade_fail ~failed_hash:dep_hash ~failed_dep_hash:hash; 1289 + propagate_failure dep_hash 1290 + end 1291 + end 1292 + ) (try Hashtbl.find rdeps hash with Not_found -> []) 1293 + in 1294 + (* Record a node's completion and promote its dependents *) 1295 + let complete_node hash success = 1296 + Hashtbl.replace completed hash success; 1297 + incr completed_count; 1298 + if not success then incr failed_count; 1299 + on_complete ~total ~completed:!completed_count ~failed:!failed_count hash success; 1300 + promote_dependents hash 1301 + in 1274 1302 let reap_one () = 1275 1303 let pid, status = Unix.waitpid [] (-1) in 1276 1304 let exit_code = match status with ··· 1280 1308 match Hashtbl.find_opt running pid with 1281 1309 | Some hash -> 1282 1310 Hashtbl.remove running pid; 1283 - let success = exit_code = 0 in 1284 - Hashtbl.replace completed hash success; 1285 - incr completed_count; 1286 - if not success then incr failed_count; 1287 - on_complete ~total ~completed:!completed_count ~failed:!failed_count hash success; 1288 - (* Promote dependents *) 1289 - List.iter (fun dep_hash -> 1290 - let count = Hashtbl.find remaining_deps dep_hash - 1 in 1291 - Hashtbl.replace remaining_deps dep_hash count; 1292 - if count = 0 then begin 1293 - (* Check if all deps actually succeeded *) 1294 - let all_deps_ok = match Hashtbl.find_opt node_of_hash dep_hash with 1295 - | Some n -> List.for_all (fun dh -> 1296 - match Hashtbl.find_opt completed dh with 1297 - | Some true -> true 1298 - | _ -> not (Hashtbl.mem node_of_hash dh) (* not in DAG = pre-existing *) 1299 - ) n.dep_build_hashes 1300 - | None -> false 1301 - in 1302 - if all_deps_ok then 1303 - Queue.push (Hashtbl.find node_of_hash dep_hash) ready 1304 - else begin 1305 - (* Dependency failed — mark as failed without building *) 1306 - Hashtbl.replace completed dep_hash false; 1307 - incr completed_count; 1308 - incr failed_count; 1309 - on_complete ~total ~completed:!completed_count ~failed:!failed_count dep_hash false; 1310 - (* Propagate failure to this node's dependents too *) 1311 - let rec propagate_failure h = 1312 - List.iter (fun rdep_hash -> 1313 - let c = Hashtbl.find remaining_deps rdep_hash - 1 in 1314 - Hashtbl.replace remaining_deps rdep_hash c; 1315 - if c = 0 then begin 1316 - Hashtbl.replace completed rdep_hash false; 1317 - incr completed_count; 1318 - incr failed_count; 1319 - on_complete ~total ~completed:!completed_count ~failed:!failed_count rdep_hash false; 1320 - propagate_failure rdep_hash 1321 - end 1322 - ) (try Hashtbl.find rdeps h with Not_found -> []) 1323 - in 1324 - propagate_failure dep_hash 1325 - end 1326 - end 1327 - ) (try Hashtbl.find rdeps hash with Not_found -> []) 1311 + complete_node hash (exit_code = 0) 1328 1312 | None -> () 1329 1313 in 1330 1314 while !completed_count < total do ··· 1335 1319 let layer_json = Path.(cache_dir / os_key / node.build_hash / "layer.json") in 1336 1320 if Sys.file_exists layer_json then begin 1337 1321 let exit_status = Util.load_layer_info_exit_status layer_json in 1338 - let success = exit_status = 0 in 1339 - Hashtbl.replace completed node.build_hash success; 1340 - incr completed_count; 1341 - if not success then incr failed_count; 1342 - on_complete ~total ~completed:!completed_count ~failed:!failed_count node.build_hash success; 1343 - (* Promote dependents *) 1344 - List.iter (fun dep_hash -> 1345 - let c = Hashtbl.find remaining_deps dep_hash - 1 in 1346 - Hashtbl.replace remaining_deps dep_hash c; 1347 - if c = 0 then begin 1348 - if success then 1349 - Queue.push (Hashtbl.find node_of_hash dep_hash) ready 1350 - (* If not success, will be handled by next reap cycle *) 1351 - end 1352 - ) (try Hashtbl.find rdeps node.build_hash with Not_found -> []) 1322 + complete_node node.build_hash (exit_status = 0) 1353 1323 end else begin 1354 1324 match Unix.fork () with 1355 1325 | 0 -> 1356 1326 Random.init (Unix.getpid () lxor int_of_float (Unix.gettimeofday () *. 1000000.)); 1357 1327 let success = (try build_one node with _ -> false) in 1358 - exit (if success then 0 else 1) 1328 + Unix._exit (if success then 0 else 1) 1359 1329 | child_pid -> 1360 1330 Hashtbl.replace running child_pid node.build_hash 1361 1331 end ··· 1372 1342 if not (Hashtbl.mem completed node.build_hash) then begin 1373 1343 Hashtbl.replace completed node.build_hash false; 1374 1344 incr completed_count; 1375 - incr failed_count 1345 + incr failed_count; 1346 + on_complete ~total ~completed:!completed_count ~failed:!failed_count node.build_hash false 1376 1347 end 1377 1348 ) nodes 1378 1349 end ··· 1409 1380 1410 1381 (* Save batch config so rerun --cascade can replay with same settings *) 1411 1382 let os_key = Config.os_key ~config in 1383 + Os.mkdir ~parents:true Path.(config.dir / os_key); 1412 1384 let build_config_path = Path.(config.dir / os_key / "build-config.json") in 1413 1385 let build_config_json = `Assoc [ 1414 1386 ("opam_repositories", `List (List.map (fun s -> `String s) config.opam_repositories)); ··· 1593 1565 let items = List.filter_map (fun (target, _solution) -> 1594 1566 List.find_opt (fun (t, _) -> OpamPackage.equal t target) blessing_maps 1595 1567 ) solutions in 1596 - let print_batch_summary () = 1568 + let print_batch_summary ?(per_solution_hashes : (OpamPackage.t list * (string, string) Hashtbl.t) list option) () = 1597 1569 let os_key = Config.os_key ~config in 1598 1570 let layer_dir = Path.(config.dir / os_key) in 1599 1571 let packages_dir = Path.(config.dir / os_key / "packages") in ··· 1650 1622 let build_layer_info = Hashtbl.create 64 in 1651 1623 (* Track which (pkg, build_hash) pairs we've already processed *) 1652 1624 let processed = Hashtbl.create 4096 in 1653 - (* Cache for Container.layer_hash results — avoids re-reading opam files *) 1654 - let hash_cache : (string, string) Hashtbl.t = Hashtbl.create 4096 in 1655 - let cached_layer_hash ~t pkgs = 1656 - (* Key: sorted package names *) 1657 - let key = String.concat "," (List.map OpamPackage.to_string pkgs) in 1658 - match Hashtbl.find_opt hash_cache key with 1659 - | Some h -> h 1625 + (* Precompute per-solution data: topological order, pkg->build_hash mapping. 1626 + When per_solution_hashes is provided (fork path), use those directly. 1627 + Otherwise compute from scratch (non-fork path). *) 1628 + let solution_info = match per_solution_hashes with 1629 + | Some hashes -> 1630 + List.map2 (fun (target, solution) (ordered, pkg_hashes) -> 1631 + (target, solution, ordered, pkg_hashes) 1632 + ) solutions hashes 1660 1633 | None -> 1661 - let h = Container.layer_hash ~t pkgs in 1662 - Hashtbl.replace hash_cache key h; 1663 - h 1634 + let t_for_hash = Container.init ~config in 1635 + List.map (fun (target, solution) -> 1636 + let ordered = topological_sort solution in 1637 + let dependencies = pkg_deps solution ordered in 1638 + let pkg_hashes = Hashtbl.create (List.length ordered) in 1639 + List.iter (fun pkg -> 1640 + let ordered_deps = extract_dag dependencies pkg |> topological_sort |> List.rev |> List.tl in 1641 + let hash = cached_layer_hash_global ~t:t_for_hash (pkg :: ordered_deps) in 1642 + Hashtbl.replace pkg_hashes (OpamPackage.to_string pkg) ("build-" ^ hash) 1643 + ) ordered; 1644 + (target, solution, ordered, pkg_hashes) 1645 + ) solutions 1664 1646 in 1665 - (* Iterate over solutions: for each target, compute build hashes and look up results. 1647 + (* Iterate over solutions: for each target, look up build hashes and results. 1666 1648 Use blessing maps to correctly assign blessed status. *) 1667 - let t_for_hash = Container.init ~config in 1668 - List.iter (fun (target, solution) -> 1649 + List.iter (fun (target, _solution, ordered, pkg_hashes) -> 1669 1650 let bless_map = List.find_opt (fun (t, _) -> 1670 1651 OpamPackage.equal t target 1671 1652 ) blessing_maps in 1672 - let ordered = topological_sort solution in 1673 - let dependencies = pkg_deps solution ordered in 1674 - let rec process_pkgs built_so_far = function 1675 - | [] -> () 1676 - | pkg :: rest -> 1677 - let pkg_str = OpamPackage.to_string pkg in 1678 - let ordered_deps = extract_dag dependencies pkg |> topological_sort |> List.rev |> List.tl in 1679 - let hash = cached_layer_hash ~t:t_for_hash (pkg :: ordered_deps) in 1680 - let build_layer_name = "build-" ^ hash in 1681 - let is_blessed = match bless_map with 1682 - | Some (_, map) -> Blessing.is_blessed map pkg 1683 - | None -> false 1684 - in 1685 - let key = (pkg_str, build_layer_name) in 1686 - (* Look up this layer on demand *) 1687 - match lookup_build_layer build_layer_name with 1688 - | Some (_, exit_status, compiler, _json) -> 1689 - Hashtbl.replace built_so_far pkg_str build_layer_name; 1690 - Hashtbl.replace built_packages pkg_str true; 1691 - Hashtbl.replace build_layer_info pkg_str (build_layer_name, exit_status, compiler); 1692 - if not (Hashtbl.mem processed key) then begin 1693 - Hashtbl.replace processed key true; 1694 - if exit_status = 0 then begin 1695 - incr build_success; 1696 - let build_log = Path.(layer_dir / build_layer_name / "build.log") in 1697 - Day10_lib.Run_log.add_build_log run_info ~package:pkg_str ~source_log:build_log; 1698 - record_build_result ~packages_dir ~run_id ~pkg_str 1699 - ~build_hash:build_layer_name ~compiler ~blessed:is_blessed 1700 - ~status:"success" ~category:"success" () 1701 - end else begin 1702 - incr build_fail; 1703 - failures := (pkg_str, Printf.sprintf "build exit code %d" exit_status) :: !failures; 1704 - let build_log = Path.(layer_dir / build_layer_name / "build.log") in 1705 - Day10_lib.Run_log.add_build_log run_info ~package:pkg_str ~source_log:build_log; 1706 - let (status, category, error) = classify_build_failure build_log in 1707 - record_build_result ~packages_dir ~run_id ~pkg_str 1708 - ~build_hash:build_layer_name ~compiler ~blessed:is_blessed 1709 - ~status ~category ?error () 1710 - end 1711 - end; 1712 - (* Continue building if this dep succeeded *) 1713 - if exit_status = 0 then 1714 - process_pkgs built_so_far rest 1715 - (* else: remaining deps are dependency failures, handled below *) 1716 - | None -> 1717 - (* Layer doesn't exist — this package was never built (dep failure upstream) *) 1718 - () 1719 - in 1720 - process_pkgs (Hashtbl.create 64) ordered; 1721 - ) solutions; 1653 + List.iter (fun pkg -> 1654 + let pkg_str = OpamPackage.to_string pkg in 1655 + let build_layer_name = Hashtbl.find pkg_hashes pkg_str in 1656 + let is_blessed = match bless_map with 1657 + | Some (_, map) -> Blessing.is_blessed map pkg 1658 + | None -> false 1659 + in 1660 + let key = (pkg_str, build_layer_name) in 1661 + (* Look up this layer on demand *) 1662 + match lookup_build_layer build_layer_name with 1663 + | Some (_, exit_status, compiler, _json) -> 1664 + Hashtbl.replace built_packages pkg_str true; 1665 + Hashtbl.replace build_layer_info pkg_str (build_layer_name, exit_status, compiler); 1666 + if not (Hashtbl.mem processed key) then begin 1667 + Hashtbl.replace processed key true; 1668 + if exit_status = 0 then begin 1669 + incr build_success; 1670 + let build_log = Path.(layer_dir / build_layer_name / "build.log") in 1671 + Day10_lib.Run_log.add_build_log run_info ~package:pkg_str ~source_log:build_log; 1672 + record_build_result ~packages_dir ~run_id ~pkg_str 1673 + ~build_hash:build_layer_name ~compiler ~blessed:is_blessed 1674 + ~status:"success" ~category:"success" () 1675 + end else begin 1676 + incr build_fail; 1677 + failures := (pkg_str, Printf.sprintf "build exit code %d" exit_status) :: !failures; 1678 + let build_log = Path.(layer_dir / build_layer_name / "build.log") in 1679 + Day10_lib.Run_log.add_build_log run_info ~package:pkg_str ~source_log:build_log; 1680 + let (status, category, error) = classify_build_failure build_log in 1681 + record_build_result ~packages_dir ~run_id ~pkg_str 1682 + ~build_hash:build_layer_name ~compiler ~blessed:is_blessed 1683 + ~status ~category ?error () 1684 + end 1685 + end 1686 + | None -> 1687 + (* Layer doesn't exist — dep failure, handled in next loop *) 1688 + () 1689 + ) ordered; 1690 + ) solution_info; 1722 1691 (* Process doc layers — only scan doc-* directories modified during this run *) 1723 1692 let run_start_time = Day10_lib.Run_log.get_start_time run_info in 1724 1693 (try ··· 1771 1740 (* Record dependency failures: packages in solutions that have no build layer. 1772 1741 Walk the dependency graph to find the root cause — the first dep that 1773 1742 actually failed to build (has a build layer with non-zero exit). *) 1774 - List.iter (fun (target, solution) -> 1743 + (* Cache: build_hash -> root failure info (keyed by build_hash since same package 1744 + can have different hashes/outcomes in different solutions) *) 1745 + let root_failure_cache : (string, (string * string) option) Hashtbl.t = Hashtbl.create 256 in 1746 + let rec find_root_failure solution pkg_hashes pkg visited = 1747 + let pkg_str = OpamPackage.to_string pkg in 1748 + let build_hash = try Hashtbl.find pkg_hashes pkg_str with Not_found -> "" in 1749 + if build_hash = "" then None 1750 + else 1751 + match Hashtbl.find_opt root_failure_cache build_hash with 1752 + | Some cached -> cached 1753 + | None -> 1754 + if OpamPackage.Set.mem pkg visited then None 1755 + else 1756 + let visited = OpamPackage.Set.add pkg visited in 1757 + let result = match lookup_build_layer build_hash with 1758 + | Some (_, exit_status, _, _) when exit_status <> 0 -> 1759 + Some (pkg_str, build_hash) 1760 + | Some _ -> 1761 + None (* This dep succeeded, not the cause *) 1762 + | None -> 1763 + (* No layer = dep-failure, walk deps to find root *) 1764 + let dep_pkgs = try OpamPackage.Set.elements (OpamPackage.Map.find pkg solution) with Not_found -> [] in 1765 + List.find_map (fun dep -> find_root_failure solution pkg_hashes dep visited) dep_pkgs 1766 + in 1767 + (* Only cache positive results — None may resolve via a different path *) 1768 + if result <> None then 1769 + Hashtbl.replace root_failure_cache build_hash result; 1770 + result 1771 + in 1772 + List.iter (fun (target, solution, _ordered, pkg_hashes) -> 1775 1773 let bless_map = List.find_opt (fun (t, _) -> 1776 1774 OpamPackage.equal t target 1777 1775 ) blessing_maps in 1778 - let ordered = topological_sort solution in 1779 - let dependencies = pkg_deps solution ordered in 1780 - (* Find the root failing dependency by walking transitive deps *) 1781 - let rec find_root_failure pkg visited = 1782 - let pkg_str = OpamPackage.to_string pkg in 1783 - if OpamPackage.Set.mem pkg visited then None 1784 - else 1785 - let visited = OpamPackage.Set.add pkg visited in 1786 - (* Check if this package itself failed to build *) 1787 - match Hashtbl.find_opt build_layer_info pkg_str with 1788 - | Some (hash, exit_status, _) when exit_status <> 0 -> 1789 - Some (pkg_str, hash) 1790 - | _ -> 1791 - (* Walk its deps to find the root cause *) 1792 - let dep_pkgs = try OpamPackage.Set.elements (OpamPackage.Map.find pkg solution) with Not_found -> [] in 1793 - List.find_map (fun dep -> find_root_failure dep visited) dep_pkgs 1794 - in 1795 - (* Compute universe hash for this target's solution *) 1796 - let target_universe_hash = 1797 - let build_hashes = List.filter_map (fun pkg -> 1798 - let ordered_deps = extract_dag dependencies pkg |> topological_sort |> List.rev |> List.tl in 1799 - let hash = cached_layer_hash ~t:t_for_hash (pkg :: ordered_deps) in 1800 - Some ("build-" ^ hash) 1801 - ) ordered in 1802 - Odoc_gen.compute_universe_hash build_hashes 1803 - in 1804 1776 OpamPackage.Map.iter (fun pkg _deps -> 1805 1777 let pkg_str = OpamPackage.to_string pkg in 1806 1778 if not (Hashtbl.mem built_packages pkg_str) then begin 1807 - let failed_dep_info = find_root_failure pkg OpamPackage.Set.empty in 1808 - let failed_dep, failed_dep_hash = match failed_dep_info with 1809 - | Some (dep, hash) -> (Some dep, Some hash) 1810 - | None -> (None, None) 1811 - in 1812 - let is_blessed = match bless_map with 1813 - | Some (_, map) -> Blessing.is_blessed map pkg 1814 - | None -> false 1815 - in 1816 - record_build_result ~packages_dir ~run_id ~pkg_str 1817 - ~build_hash:("universe-" ^ target_universe_hash) ~compiler:"" ~blessed:is_blessed 1818 - ~status:"failure" ~category:"dependency_failure" 1819 - ?failed_dep ?failed_dep_hash () 1779 + let build_hash = Hashtbl.find pkg_hashes pkg_str in 1780 + (* Skip if already recorded this exact (pkg, hash) pair *) 1781 + if not (Hashtbl.mem processed (pkg_str, build_hash)) then begin 1782 + Hashtbl.replace processed (pkg_str, build_hash) true; 1783 + let failed_dep_info = find_root_failure solution pkg_hashes pkg OpamPackage.Set.empty in 1784 + let failed_dep, failed_dep_hash = match failed_dep_info with 1785 + | Some (dep, hash) -> (Some dep, Some hash) 1786 + | None -> (None, None) 1787 + in 1788 + let is_blessed = match bless_map with 1789 + | Some (_, map) -> Blessing.is_blessed map pkg 1790 + | None -> false 1791 + in 1792 + record_build_result ~packages_dir ~run_id ~pkg_str 1793 + ~build_hash ~compiler:"" ~blessed:is_blessed 1794 + ~status:"failure" ~category:"dependency_failure" 1795 + ?failed_dep ?failed_dep_hash () 1796 + end 1820 1797 end 1821 1798 ) solution 1822 - ) solutions; 1799 + ) solution_info; 1823 1800 (* Write universe JSON files: for each target/solution, compute universe hash 1824 1801 and write a file listing the packages in that universe *) 1825 1802 let universes_dir = Path.(layer_dir / "universes") in 1826 1803 (try Unix.mkdir universes_dir 0o755 with Unix.Unix_error (Unix.EEXIST, _, _) -> ()); 1827 1804 let universe_hashes_written = Hashtbl.create 64 in 1828 - List.iter (fun (_target, solution) -> 1829 - let ordered = topological_sort solution in 1830 - let dependencies = pkg_deps solution ordered in 1831 - (* Compute build hashes for all packages in this solution *) 1805 + List.iter (fun (_target, _solution, ordered, pkg_hashes) -> 1806 + (* Look up build hashes from precomputed mapping *) 1832 1807 let build_hashes = List.filter_map (fun pkg -> 1833 - let ordered_deps = extract_dag dependencies pkg |> topological_sort |> List.rev |> List.tl in 1834 - let hash = cached_layer_hash ~t:t_for_hash (pkg :: ordered_deps) in 1835 - Some ("build-" ^ hash) 1808 + Hashtbl.find_opt pkg_hashes (OpamPackage.to_string pkg) 1836 1809 ) ordered in 1837 1810 let universe_hash = Odoc_gen.compute_universe_hash build_hashes in 1838 1811 if not (Hashtbl.mem universe_hashes_written universe_hash) then begin ··· 1853 1826 output_char oc '\n'); 1854 1827 Sys.rename tmp universe_file 1855 1828 end 1856 - ) solutions; 1829 + ) solution_info; 1857 1830 Printf.printf " Universes written: %d\n%!" (Hashtbl.length universe_hashes_written); 1858 1831 let html_versions = match config.html_output with 1859 1832 | None -> 0 ··· 1971 1944 Day10_lib.Progress.delete ~run_dir:(Day10_lib.Run_log.get_run_dir run_info) 1972 1945 | Some n -> 1973 1946 Printf.printf " Building global DAG...\n%!"; 1974 - let dag_nodes = build_global_dag ~config solutions in 1947 + let (dag_nodes, per_solution_hashes) = build_global_dag ~config solutions in 1975 1948 Printf.printf " %d unique build layers in DAG\n%!" (List.length dag_nodes); 1976 1949 let os_key = Config.os_key ~config in 1977 1950 let t = Container.init ~config in ··· 1995 1968 last_reported := completed 1996 1969 end 1997 1970 in 1971 + let on_cascade ~failed_hash ~failed_dep_hash = 1972 + let pkg_str = match Hashtbl.find_opt node_by_hash failed_hash with 1973 + | Some n -> OpamPackage.to_string n.pkg | None -> failed_hash in 1974 + let dep_pkg_str = match Hashtbl.find_opt node_by_hash failed_dep_hash with 1975 + | Some n -> OpamPackage.to_string n.pkg | None -> failed_dep_hash in 1976 + Os.log "dag: CASCADE %s (%s) — dep %s (%s) failed" pkg_str failed_hash dep_pkg_str failed_dep_hash; 1977 + (* Write skeleton layer so cascade rerun can rebuild without re-solving *) 1978 + (match Hashtbl.find_opt node_by_hash failed_hash with 1979 + | None -> () 1980 + | Some node -> 1981 + Util.write_skeleton_layer ~cache_dir:config.dir ~os_key 1982 + ~opam_repositories:config.opam_repositories ~layer_name:node.build_hash 1983 + ~pkg:node.pkg ~ordered_deps:node.ordered_deps ~dep_build_hashes:node.dep_build_hashes) 1984 + in 1998 1985 let build_one (node : build_node) = 1999 1986 let r = build_layer t node.pkg node.build_hash node.ordered_deps node.dep_build_hashes in 2000 1987 match r with 2001 1988 | Success _ -> true 2002 1989 | _ -> false 2003 1990 in 2004 - execute_dag ~np:n ~on_complete ~cache_dir:config.dir ~os_key dag_nodes build_one; 1991 + execute_dag ~np:n ~on_complete ~on_cascade ~cache_dir:config.dir ~os_key dag_nodes build_one; 2005 1992 Container.deinit ~t; 2006 1993 Printf.printf "\n%!"; 2007 1994 (* Run global deferred doc link pass for x-extra-doc-deps *) ··· 2055 2042 Day10_lib.Progress.write ~run_dir:(Day10_lib.Run_log.get_run_dir run_info) !progress_ref; 2056 2043 (* Run garbage collection *) 2057 2044 run_gc ~config ~solutions; 2058 - print_batch_summary (); 2045 + print_batch_summary ~per_solution_hashes (); 2059 2046 (* Delete progress.json - summary.json takes over *) 2060 2047 Day10_lib.Progress.delete ~run_dir:(Day10_lib.Run_log.get_run_dir run_info) 2061 2048 ··· 2360 2347 2361 2348 (** Compute universe hash from a build layer's layer.json hashes field *) 2362 2349 let universe_hash_of_layer ~os_dir layer_name = 2363 - (* For dependency failures, the build_hash is "universe-{hash}" directly *) 2350 + (* Legacy: old dep failures used "universe-{hash}" as build_hash *) 2364 2351 if String.length layer_name > 9 && String.sub layer_name 0 9 = "universe-" then 2365 2352 Some (String.sub layer_name 9 (String.length layer_name - 9)) 2366 2353 else ··· 2852 2839 (** Rerun a single build layer: read its layer.json for the original deps/hashes, 2853 2840 delete the layer, and rebuild using exactly the same inputs. No re-solving. 2854 2841 Records the result in history. *) 2855 - let rerun_build_layer ~cache_dir ~os_key ~opam_repositories ~local_repos ~arch ~os ~os_distribution ~os_family ~os_version ~packages_dir ~run_id ~build_hash = 2842 + let rerun_build_layer ~cache_dir ~os_key ~arch ~os ~os_distribution ~os_family ~os_version ~packages_dir ~run_id ~build_hash = 2856 2843 let layer_dir = Path.(cache_dir / os_key / build_hash) in 2857 2844 let layer_json = Path.(layer_dir / "layer.json") in 2858 2845 if not (Sys.file_exists layer_json) then begin ··· 2877 2864 ) in 2878 2865 let ordered_build_hashes = json |> member "hashes" |> to_list |> List.map to_string in 2879 2866 Printf.printf "Rerunning %s (%s)\n%!" pkg_str build_hash; 2867 + (* Save the layer's opam-repository before deleting — it contains the exact 2868 + opam files used for the original build *) 2869 + let saved_opam_repo = Path.(cache_dir / os_key / ".rerun-opam-repo-" ^ build_hash) in 2870 + let layer_opam_repo = Path.(layer_dir / "opam-repository") in 2871 + (if Sys.file_exists layer_opam_repo then begin 2872 + Printf.printf " Saving opam-repository from layer...\n%!"; 2873 + let _ = Os.sudo ["cp"; "-a"; layer_opam_repo; saved_opam_repo] in () 2874 + end else 2875 + Printf.eprintf " Warning: no opam-repository in layer, rebuild may use wrong opam files\n%!"); 2880 2876 Printf.printf " Removing cached layer...\n%!"; 2881 2877 let _ = Os.sudo ["rm"; "-rf"; layer_dir] in 2878 + (* Use the saved opam-repository from the layer itself *) 2879 + let opam_repositories = if Sys.file_exists saved_opam_repo then [saved_opam_repo] else [] in 2882 2880 (* Build a minimal config — only what build_layer needs *) 2883 2881 let config : Config.t = { 2884 2882 dir = cache_dir; ocaml_version = None; opam_repositories; ··· 2887 2885 with_test = false; with_doc = false; with_jtw = false; 2888 2886 doc_tools_repo = ""; doc_tools_branch = ""; 2889 2887 jtw_tools_repo = ""; jtw_tools_branch = ""; 2890 - local_repos; html_output = None; jtw_output = None; tag = None; 2888 + local_repos = []; html_output = None; jtw_output = None; tag = None; 2891 2889 log = false; dry_run = false; fork = None; prune_layers = false; blessed_map = None; 2892 2890 } in 2893 2891 let t = Container.init ~config in 2892 + init t; 2894 2893 let result = build_layer t pkg build_hash ordered_deps ordered_build_hashes in 2895 2894 let (success, status, category, error) = match result with 2896 2895 | Success _ -> (true, "success", "success", None) ··· 2899 2898 (false, "failure", cat, err) 2900 2899 | _ -> (false, "failure", "build_failure", Some "unexpected result") 2901 2900 in 2901 + (* Clean up saved opam-repository *) 2902 + if Sys.file_exists saved_opam_repo then 2903 + ignore (Os.sudo ["rm"; "-rf"; saved_opam_repo]); 2902 2904 Printf.printf " Result: %s\n%!" category; 2903 2905 record_build_result ~packages_dir ~run_id ~pkg_str ~build_hash 2904 2906 ~compiler ~blessed ~status ~category ?error (); 2905 2907 success 2906 2908 end 2907 2909 2908 - (** After a cascade rebuild via run_health_check, scan the package's build layers 2909 - and record history entries for any new/changed layers. 2910 - [was_blessed] indicates whether the original dependency_failure entry was blessed — 2911 - if so, any new build layer for this package inherits blessed status. *) 2912 - let record_cascade_results ~cache_dir ~os_key ~packages_dir ~run_id ~pkg_str ~was_blessed = 2913 - let os_dir = Path.(cache_dir / os_key) in 2914 - let pkg_dir = Path.(packages_dir / pkg_str) in 2915 - if Sys.file_exists pkg_dir then begin 2916 - (* Find build-* and doc-* symlinks in the package directory *) 2917 - try 2918 - Sys.readdir pkg_dir |> Array.iter (fun name -> 2919 - if String.length name > 6 && String.sub name 0 6 = "build-" then begin 2920 - let layer_json = Path.(os_dir / name / "layer.json") in 2921 - if Sys.file_exists layer_json then begin 2922 - try 2923 - let json = Yojson.Safe.from_file layer_json in 2924 - let open Yojson.Safe.Util in 2925 - let exit_status = try json |> member "exit_status" |> to_int with _ -> -1 in 2926 - let compiler = extract_compiler_from_deps json in 2927 - let (status, category, error) = 2928 - if exit_status = 0 then ("success", "success", None) 2929 - else 2930 - let (_s, cat, err) = classify_build_failure Path.(os_dir / name / "build.log") in 2931 - ("failure", cat, match err with Some _ -> err | None -> Some (Printf.sprintf "exit code %d" exit_status)) 2932 - in 2933 - record_build_result ~packages_dir ~run_id ~pkg_str ~build_hash:name 2934 - ~compiler ~blessed:was_blessed ~status ~category ?error () 2935 - with _ -> () 2936 - end 2937 - end; 2938 - if String.length name > 4 && String.sub name 0 4 = "doc-" then begin 2939 - let layer_json = Path.(os_dir / name / "layer.json") in 2940 - if Sys.file_exists layer_json then begin 2941 - try 2942 - let json = Yojson.Safe.from_file layer_json in 2943 - let open Yojson.Safe.Util in 2944 - let exit_status = try json |> member "exit_status" |> to_int with _ -> -1 in 2945 - if exit_status <> 0 then begin 2946 - let compiler = extract_compiler_from_deps json in 2947 - let (_s, category, err) = classify_build_failure Path.(os_dir / name / "odoc-voodoo-all.log") in 2948 - let error = match err with Some _ -> err | None -> Some (Printf.sprintf "doc exit code %d" exit_status) in 2949 - record_build_result ~packages_dir ~run_id ~pkg_str ~build_hash:name 2950 - ~compiler ~blessed:was_blessed ~status:"failure" ~category ?error () 2951 - end 2952 - with _ -> () 2953 - end 2954 - end 2955 - ) 2956 - with _ -> () 2957 - end 2958 - 2959 2910 let run_rerun ~cache_dir ~format ~arch ~os ~os_distribution ~os_family ~os_version ~cascade ~target = 2960 2911 let os_key = Printf.sprintf "%s-%s-%s" os_distribution os_version arch in 2961 2912 let packages_dir = Path.(cache_dir / os_key / "packages") in 2962 - (* Read opam_repositories from build-config.json *) 2963 - let build_config_path = Path.(cache_dir / os_key / "build-config.json") in 2964 - let (opam_repositories, local_repos) = 2965 - if Sys.file_exists build_config_path then begin 2966 - try 2967 - let json = Yojson.Safe.from_file build_config_path in 2968 - let open Yojson.Safe.Util in 2969 - let sl key = try json |> member key |> to_list |> List.map to_string with _ -> [] in 2970 - (sl "opam_repositories", sl "local_repos") 2971 - with _ -> 2972 - Printf.eprintf "Error: could not read %s\n%!" build_config_path; 2973 - Stdlib.exit 1 2974 - end else begin 2975 - Printf.eprintf "Error: no build-config.json found at %s (run batch first)\n%!" build_config_path; 2976 - Stdlib.exit 1 2977 - end 2978 - in 2979 2913 let run_id = Printf.sprintf "rerun-%s" (Day10_lib.Run_log.format_time (Unix.gettimeofday ())) in 2980 2914 (* Determine if target is a build hash or package name *) 2981 2915 let is_build_hash = String.length target > 6 && String.sub target 0 6 = "build-" in ··· 3025 2959 let rerun_hashes = List.map snd builds_to_rerun in 3026 2960 (* Rerun: delete layer, rebuild with same deps/hashes *) 3027 2961 let succeeded = List.filter_map (fun (_pkg_name, build_hash) -> 3028 - if rerun_build_layer ~cache_dir ~os_key ~opam_repositories ~local_repos ~arch ~os ~os_distribution ~os_family ~os_version ~packages_dir ~run_id ~build_hash then 2962 + if rerun_build_layer ~cache_dir ~os_key ~arch ~os ~os_distribution ~os_family ~os_version ~packages_dir ~run_id ~build_hash then 3029 2963 Some build_hash 3030 2964 else 3031 2965 None ··· 3034 2968 if cascade && succeeded <> [] then begin 3035 2969 let cascade_targets = find_cascade_targets ~packages_dir ~build_hashes:rerun_hashes in 3036 2970 if cascade_targets <> [] then begin 3037 - (* Read remaining config from build-config.json *) 3038 - let (with_doc, doc_tools_repo, doc_tools_branch, jtw_tools_repo, jtw_tools_branch, 3039 - html_output, jtw_output) = 3040 - try 3041 - let json = Yojson.Safe.from_file build_config_path in 3042 - let open Yojson.Safe.Util in 3043 - let s key default = json |> member key |> to_string_option |> Option.value ~default in 3044 - let b key default = json |> member key |> to_bool_option |> Option.value ~default in 3045 - let so key = json |> member key |> to_string_option in 3046 - (b "with_doc" false, 3047 - s "doc_tools_repo" "", s "doc_tools_branch" "", 3048 - s "jtw_tools_repo" "", s "jtw_tools_branch" "", 3049 - so "html_output", so "jtw_output") 3050 - with _ -> (false, "", "", "", "", None, None) 3051 - in 3052 - Printf.printf "Cascading to %d packages with dependency_failure on rerun targets%s:\n%!" 3053 - (List.length cascade_targets) (if with_doc then " (with docs)" else ""); 2971 + Printf.printf "Cascading to %d packages with dependency_failure on rerun targets:\n%!" 2972 + (List.length cascade_targets); 3054 2973 List.iter (fun (pkg, (e : Day10_lib.History.entry)) -> 3055 2974 Printf.printf " %s (was blocked on %s)\n%!" pkg 3056 2975 (match e.failed_dep with Some d -> d | None -> "unknown"); 3057 - run_health_check { dir = cache_dir; ocaml_version = None; opam_repositories; 3058 - package = pkg; arch; os; os_distribution; os_family; os_version; 3059 - directory = None; md = None; json = None; dot = None; 3060 - with_test = false; with_doc; with_jtw = false; 3061 - doc_tools_repo; doc_tools_branch; 3062 - jtw_tools_repo; jtw_tools_branch; 3063 - local_repos; html_output; jtw_output; tag = None; 3064 - log = false; dry_run = false; fork = None; prune_layers = false; blessed_map = None }; 3065 - record_cascade_results ~cache_dir ~os_key ~packages_dir ~run_id ~pkg_str:pkg ~was_blessed:e.blessed 2976 + ignore (rerun_build_layer ~cache_dir ~os_key ~arch ~os ~os_distribution ~os_family ~os_version 2977 + ~packages_dir ~run_id ~build_hash:e.build_hash) 3066 2978 ) cascade_targets 3067 2979 end else 3068 2980 Printf.printf "\nNo cascade targets found.\n%!" ··· 3267 3179 Printf.printf " %-40s (dep %s now passes)%s\n%!" pkg dep_info blessed_marker 3268 3180 ) all_cascades; 3269 3181 if not dry_run then begin 3270 - (* Read build-config.json from last batch run *) 3271 - let build_config_path = Path.(cache_dir / os_key / "build-config.json") in 3272 - let (opam_repos, with_doc, doc_tools_repo, doc_tools_branch, jtw_tools_repo, jtw_tools_branch, 3273 - html_output, jtw_output, local_repos) = 3274 - if Sys.file_exists build_config_path then begin 3275 - try 3276 - let json = Yojson.Safe.from_file build_config_path in 3277 - let open Yojson.Safe.Util in 3278 - let s key default = json |> member key |> to_string_option |> Option.value ~default in 3279 - let b key default = json |> member key |> to_bool_option |> Option.value ~default in 3280 - let so key = json |> member key |> to_string_option in 3281 - let sl key = try json |> member key |> to_list |> List.map to_string with _ -> [] in 3282 - Printf.printf "Using batch config from %s\n%!" build_config_path; 3283 - (sl "opam_repositories", b "with_doc" false, 3284 - s "doc_tools_repo" "", s "doc_tools_branch" "", 3285 - s "jtw_tools_repo" "", s "jtw_tools_branch" "", 3286 - so "html_output", so "jtw_output", sl "local_repos") 3287 - with _ -> 3288 - Printf.eprintf "Error: could not read %s\n%!" build_config_path; 3289 - Stdlib.exit 1 3290 - end else begin 3291 - Printf.eprintf "Error: no build-config.json found at %s (run batch first)\n%!" build_config_path; 3292 - Stdlib.exit 1 3293 - end 3294 - in 3295 - if with_doc then Printf.printf "Building with docs enabled\n%!"; 3296 3182 let run_id = Printf.sprintf "cascade-%s" (Day10_lib.Run_log.format_time (Unix.gettimeofday ())) in 3297 3183 let run_one (pkg, (e : Day10_lib.History.entry)) = 3298 - Printf.printf "Rerunning %s...\n%!" pkg; 3299 - run_health_check { dir = cache_dir; ocaml_version = None; opam_repositories = opam_repos; 3300 - package = pkg; arch; os; os_distribution; os_family; os_version; 3301 - directory = None; md = None; json = None; dot = None; 3302 - with_test = false; with_doc; with_jtw = false; 3303 - doc_tools_repo; doc_tools_branch; 3304 - jtw_tools_repo; jtw_tools_branch; 3305 - local_repos; html_output; jtw_output; tag = None; 3306 - log = false; dry_run = false; fork = None; prune_layers = false; blessed_map = None }; 3307 - record_cascade_results ~cache_dir ~os_key ~packages_dir ~run_id ~pkg_str:pkg ~was_blessed:e.blessed 3184 + Printf.printf "Rerunning %s (%s)...\n%!" pkg e.build_hash; 3185 + ignore (rerun_build_layer ~cache_dir ~os_key ~arch ~os ~os_distribution ~os_family ~os_version 3186 + ~packages_dir ~run_id ~build_hash:e.build_hash) 3308 3187 in 3309 3188 (match fork with 3310 3189 | Some n when n > 1 ->
+17 -11
day10/bin/os.ml
··· 76 76 (if parents then 77 77 let parent_dir = Filename.dirname dir in 78 78 if parent_dir <> dir then mkdir ~parents:true parent_dir); 79 - Sys.mkdir dir 0o755) 79 + try Sys.mkdir dir 0o755 80 + with Sys_error _ when Sys.file_exists dir && Sys.is_directory dir -> ()) 80 81 81 82 (** Create a unique temporary directory. Unlike Filename.temp_dir, this includes 82 83 the PID in the name to guarantee uniqueness across forked processes. *) ··· 164 165 let running, finished = 165 166 IntSet.partition 166 167 (fun pid -> 167 - let c, _ = Unix.waitpid [ WNOHANG ] pid in 168 - pid <> c) 168 + (try let c, _ = Unix.waitpid [ WNOHANG ] pid in pid <> c 169 + with Unix.Unix_error (Unix.EINTR, _, _) -> true)) 169 170 acc 170 171 in 171 172 let () = if IntSet.is_empty finished then Unix.sleepf 0.1 in ··· 194 195 (* Try to reap finished processes, returning (still_running, exit_codes) *) 195 196 let reap_finished pids = 196 197 IntSet.fold (fun pid (running, codes) -> 197 - let c, status = Unix.waitpid [ WNOHANG ] pid in 198 - if c = pid then 199 - (running, status_of_wait status :: codes) 200 - else 201 - (IntSet.add pid running, codes) 198 + match Unix.waitpid [ WNOHANG ] pid with 199 + | c, status when c = pid -> (running, status_of_wait status :: codes) 200 + | _ -> (IntSet.add pid running, codes) 201 + | exception Unix.Unix_error (Unix.EINTR, _, _) -> (IntSet.add pid running, codes) 202 202 ) pids (IntSet.empty, []) 203 203 in 204 204 List.fold_left ··· 247 247 let running, finished = 248 248 IntSet.partition 249 249 (fun pid -> 250 - let c, _ = Unix.waitpid [ WNOHANG ] pid in 251 - pid <> c) 250 + (try let c, _ = Unix.waitpid [ WNOHANG ] pid in pid <> c 251 + with Unix.Unix_error (Unix.EINTR, _, _) -> true)) 252 252 acc 253 253 in 254 254 let () = if IntSet.is_empty finished then Unix.sleepf 0.1 in ··· 269 269 | child -> IntSet.add child acc) 270 270 IntSet.empty indexed 271 271 in 272 - IntSet.iter (fun pid -> ignore (Unix.waitpid [] pid)) pids; 272 + IntSet.iter (fun pid -> 273 + let rec wait () = 274 + try ignore (Unix.waitpid [] pid) 275 + with Unix.Unix_error (Unix.EINTR, _, _) -> wait () 276 + in 277 + wait () 278 + ) pids; 273 279 (* Collect results *) 274 280 List.map (fun (i, x) -> 275 281 let result_file = Filename.concat temp_dir (string_of_int i) in
+49
day10/bin/util.ml
··· 212 212 let () = Os.write_to_file Path.(path / "repo") {|opam-version: "2.0"|} in 213 213 path 214 214 215 + (** Copy opam files for [packages] into an opam-repository directory. 216 + Searches [opam_repositories] in order for each package's opam file. *) 217 + let populate_opam_repository ~opam_repo ~opam_repositories packages = 218 + List.iter (fun dep_pkg -> 219 + let opam_relative_path = Path.("packages" / OpamPackage.name_to_string dep_pkg / OpamPackage.to_string dep_pkg) in 220 + List.find_map (fun opam_repository -> 221 + let opam = Path.(opam_repository / opam_relative_path) in 222 + if Sys.file_exists opam then Some opam else None 223 + ) opam_repositories 224 + |> Option.iter (fun src -> 225 + let dst = Path.(opam_repo / opam_relative_path) in 226 + Os.mkdir ~parents:true dst; 227 + Os.cp Path.(src / "opam") Path.(dst / "opam"); 228 + let src_files = Path.(src / "files") in 229 + if Sys.file_exists src_files then begin 230 + let dst_files = Path.(dst / "files") in 231 + Os.mkdir dst_files; 232 + Sys.readdir src_files |> Array.iter (fun f -> Os.cp Path.(src_files / f) Path.(dst_files / f)) 233 + end) 234 + ) packages 235 + 236 + (** Write a skeleton layer for a cascade-failed package: layer.json with 237 + exit_status=-1 and an opam-repository with the package's opam files, 238 + so that [rerun --cascade] can rebuild without re-solving from opam-repository.git. *) 239 + let write_skeleton_layer ~cache_dir ~os_key ~opam_repositories ~layer_name ~pkg ~ordered_deps ~dep_build_hashes = 240 + let layer_dir = Path.(cache_dir / os_key / layer_name) in 241 + if not (Sys.file_exists layer_dir) then begin 242 + Os.mkdir ~parents:true layer_dir; 243 + save_layer_info Path.(layer_dir / "layer.json") pkg ordered_deps dep_build_hashes (-1); 244 + let opam_repo = create_opam_repository layer_dir in 245 + populate_opam_repository ~opam_repo ~opam_repositories (pkg :: ordered_deps); 246 + ensure_package_layer_symlink ~cache_dir ~os_key ~pkg_str:(OpamPackage.to_string pkg) ~layer_name 247 + end 248 + 249 + (** Wait for a layer.json file to appear (may be created by another parallel worker). 250 + Polls every 0.5s for up to 5 minutes. *) 251 + let wait_for_layer_json ~layer_json ~layer_name = 252 + let rec loop retries = 253 + if Sys.file_exists layer_json then () 254 + else if retries <= 0 then 255 + failwith (Printf.sprintf "Layer %s never completed (layer.json missing)" layer_name) 256 + else begin 257 + Unix.sleepf 0.5; 258 + loop (retries - 1) 259 + end 260 + in 261 + loop 600; 262 + Unix.utimes layer_json 0.0 0.0 263 + 215 264 let opam_file opam_repositories pkg = 216 265 List.find_map 217 266 (fun opam_repository ->
+9 -5
day10/lib/history.ml
··· 84 84 let path = history_path ~packages_dir ~pkg_str in 85 85 mkdir_p (Filename.dirname path); 86 86 let line = Yojson.Safe.to_string (entry_to_json entry) in 87 - let oc = open_out_gen 88 - [Open_append; Open_creat; Open_wronly] 0o644 path in 89 - Fun.protect ~finally:(fun () -> close_out oc) (fun () -> 90 - output_string oc line; 91 - output_char oc '\n') 87 + let fd = Unix.openfile path [Unix.O_WRONLY; Unix.O_APPEND; Unix.O_CREAT] 0o644 in 88 + Fun.protect ~finally:(fun () -> Unix.close fd) (fun () -> 89 + Unix.lockf fd Unix.F_LOCK 0; 90 + let line = line ^ "\n" in 91 + let len = String.length line in 92 + let written = ref 0 in 93 + while !written < len do 94 + written := !written + Unix.write_substring fd line !written (len - !written) 95 + done) 92 96 93 97 let read ~packages_dir ~pkg_str = 94 98 let path = history_path ~packages_dir ~pkg_str in