···3535 stream grpc.BidiStreamingClient[pb.ConnectRequest, pb.ConnectResponse]
3636}
37373838-func (e *grpcEmitter) sendStepControl(stepID int, status string, exitCode int) {
3939- if e.stream != nil {
4040- _ = e.stream.Send(&pb.ConnectRequest{
4141- Event: &pb.ConnectRequest_StepControl{
4242- StepControl: &pb.StepControl{
4343- StepId: int32(stepID),
4444- Status: status,
4545- ExitCode: int32(exitCode),
4646- },
4747- },
4848- })
4949- }
5050-}
5151-5252-func (e *grpcEmitter) sendLogLine(stepID int, streamName, content string) {
5353- if e.stream != nil {
5454- _ = e.stream.Send(&pb.ConnectRequest{
5555- Event: &pb.ConnectRequest_LogLine{
5656- LogLine: &pb.LogLine{
5757- StepId: int32(stepID),
5858- Stream: streamName,
5959- Content: content,
6060- },
6161- },
6262- })
6363- }
6464-}
6565-6638func main() {
6739 // Handle --install flag for self-copying (used by init container in distroless image)
6840 if len(os.Args) >= 3 && os.Args[1] == "--install" {
···131103 return fmt.Errorf("failed to parse workflow spec: %w", err)
132104 }
133105134134- // Connect to operator via gRPC
106106+ // Connect to operator via gRPC (optional — only needed for artifact transfer).
107107+ // Single-arch workflows work without gRPC; all log/control events go to stdout.
135108 emitter, cleanup, err := connectToOperator(workflow)
136109 if err != nil {
137137- // gRPC connection failure is fatal — the operator won't see our events
138138- return fmt.Errorf("failed to connect to operator: %w", err)
110110+ needsGRPC := os.Getenv("LOOM_MATRIX_LEG") == "true" || os.Getenv("LOOM_FINAL") == "true"
111111+ if needsGRPC {
112112+ return fmt.Errorf("gRPC required for artifact transfer but failed to connect: %w", err)
113113+ }
114114+ fmt.Fprintf(os.Stderr, "WARNING: gRPC connection failed (artifacts unavailable): %v\n", err)
115115+ emitter = &grpcEmitter{}
116116+ cleanup = func() {}
139117 }
140118 defer cleanup()
141119···169147 // Execute each step
170148 ctx := context.Background()
171149 for i, step := range workflow.Steps {
172172- if err := executeStep(ctx, i, step, emitter); err != nil {
150150+ if err := executeStep(ctx, i, step); err != nil {
173151 return fmt.Errorf("step %d (%s) failed: %w", i, step.Name, err)
174152 }
175153 }
···228206 return emitter, cleanup, nil
229207}
230208231231-func executeStep(ctx context.Context, stepID int, step loomv1alpha1.WorkflowStep, emitter *grpcEmitter) error {
232232- // Emit step start — gRPC + stdout
233233- emitter.sendStepControl(stepID, "start", 0)
209209+func executeStep(ctx context.Context, stepID int, step loomv1alpha1.WorkflowStep) error {
210210+ // Emit step start to stdout (consumed by controller via pod logs API)
234211 emitStdoutControl(stepID, &simpleStep{name: step.Name, command: step.Command}, models.StepStatusStart)
235212236213 // Set step-specific environment variables
···259236 }
260237261238 if err := cmd.Start(); err != nil {
262262- emitter.sendStepControl(stepID, "end", 1)
239239+ emitStdoutControlWithCode(stepID, &simpleStep{name: step.Name, command: step.Command}, models.StepStatusEnd, 1)
263240 return fmt.Errorf("failed to start command: %w", err)
264241 }
265242266243 // Stream stdout and stderr concurrently
267244 done := make(chan error, 2)
268268- go streamOutput(stdout, stepID, "stdout", emitter, done)
269269- go streamOutput(stderr, stepID, "stderr", emitter, done)
245245+ go streamOutput(stdout, stepID, "stdout", done)
246246+ go streamOutput(stderr, stepID, "stderr", done)
270247271248 for i := 0; i < 2; i++ {
272249 if err := <-done; err != nil {
···285262 }
286263 }
287264288288- // Emit step end — gRPC + stdout
289289- emitter.sendStepControl(stepID, "end", exitCode)
265265+ // Emit step end to stdout (consumed by controller via pod logs API)
290266 emitStdoutControlWithCode(stepID, &simpleStep{name: step.Name, command: step.Command}, models.StepStatusEnd, exitCode)
291267292268 if exitCode != 0 {
···296272 return nil
297273}
298274299299-func streamOutput(reader io.Reader, stepID int, streamName string, emitter *grpcEmitter, done chan<- error) {
275275+func streamOutput(reader io.Reader, stepID int, streamName string, done chan<- error) {
300276 scanner := bufio.NewScanner(reader)
301277 buf := make([]byte, 0, 64*1024)
302278 scanner.Buffer(buf, 1024*1024)
303279304280 for scanner.Scan() {
305281 line := scanner.Text()
306306- // Send over gRPC (primary channel)
307307- emitter.sendLogLine(stepID, streamName, line)
308308- // Also emit to stdout for kubectl logs
309282 emitStdoutData(stepID, streamName, line)
310283 }
311284312285 done <- scanner.Err()
313286}
314287315315-// Stdout emitters — for kubectl logs debugging. Not consumed by the operator.
288288+// Stdout emitters — log content is consumed by the operator via the k8s pod logs API.
316289317290func emitStdoutControl(stepID int, step models.Step, status models.StepStatus) {
318291 logLine := models.NewControlLogLine(stepID, step, status)
+259-107
internal/engine/kubernetes_engine.go
···11package engine
2233import (
44+ "bufio"
45 "context"
66+ "encoding/json"
57 "fmt"
88+ "io"
69 "maps"
710 "strings"
1111+ "sync"
812 "time"
9131014 securejoin "github.com/cyphar/filepath-securejoin"
1115 "gopkg.in/yaml.v3"
1616+ corev1 "k8s.io/api/core/v1"
1217 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1818+ "k8s.io/client-go/kubernetes"
1319 "k8s.io/client-go/rest"
1420 "sigs.k8s.io/controller-runtime/pkg/client"
1521 "sigs.k8s.io/controller-runtime/pkg/log"
···4955 return 1000 + (numLegs+1)*100 + stepIdx
5056}
51575858+// workflowLogStream holds the state for streaming logs from a workflow's pod.
5959+type workflowLogStream struct {
6060+ scanner *bufio.Scanner
6161+ stream io.ReadCloser
6262+ pod *corev1.Pod
6363+}
6464+6565+// extendedLogLine extends models.LogLine with exit code for error reporting.
6666+type extendedLogLine struct {
6767+ models.LogLine
6868+ ExitCode int `json:"exit_code,omitempty"`
6969+}
7070+5271// KubernetesEngine implements the spindle Engine interface for Kubernetes Jobs.
5372type KubernetesEngine struct {
5473 client client.Client
···61806281 // Track created SpindleSets for cleanup
6382 spindleSets map[string]*loomv1alpha1.SpindleSet
8383+8484+ // Active log streams per workflow - persist across RunStep calls
8585+ logStreams map[string]*workflowLogStream
8686+ streamMutex sync.RWMutex
6487}
65886689// NewKubernetesEngine creates a new Kubernetes-based spindle engine.
···7497 hub: hub,
7598 artifacts: artifacts,
7699 spindleSets: make(map[string]*loomv1alpha1.SpindleSet),
100100+ logStreams: make(map[string]*workflowLogStream),
77101 }
78102}
79103···525549 // Remove from tracking map
526550 delete(e.spindleSets, wid.String())
527551552552+ // Close any open log streams for this workflow
553553+ e.closeLogStream(wid.Name + "/" + wid.Rkey)
554554+528555 // Clean up artifacts for this pipeline
529556 if e.artifacts != nil {
530557 if err := e.artifacts.Cleanup(wid.PipelineId.AtUri().String()); err != nil {
···536563 return nil
537564}
538565539539-// RunStep waits for step completion events from the runner via the gRPC hub.
566566+// findPodForWorkflow locates the runner pod for a workflow by its labels.
567567+// Polls until a pod exists and is Running, Succeeded, or Failed.
568568+func (e *KubernetesEngine) findPodForWorkflow(ctx context.Context, workflowName, pipelineID string) (*corev1.Pod, error) {
569569+ deadline := time.Now().Add(7 * time.Minute)
570570+ for {
571571+ if time.Now().After(deadline) {
572572+ return nil, fmt.Errorf("timeout waiting for pod for workflow %s", workflowName)
573573+ }
574574+575575+ pods := &corev1.PodList{}
576576+ if err := e.client.List(ctx, pods,
577577+ client.InNamespace(e.namespace),
578578+ client.MatchingLabels{
579579+ "loom.j5t.io/pipeline-id": pipelineID,
580580+ "loom.j5t.io/workflow": workflowName,
581581+ },
582582+ ); err != nil {
583583+ return nil, fmt.Errorf("failed to list pods: %w", err)
584584+ }
585585+586586+ for i := range pods.Items {
587587+ pod := &pods.Items[i]
588588+ switch pod.Status.Phase {
589589+ case corev1.PodRunning, corev1.PodSucceeded, corev1.PodFailed:
590590+ return pod, nil
591591+ }
592592+ }
593593+594594+ select {
595595+ case <-ctx.Done():
596596+ return nil, ctx.Err()
597597+ case <-time.After(2 * time.Second):
598598+ }
599599+ }
600600+}
601601+602602+// openLogStream opens a Kubernetes pod log stream for the runner container.
603603+func (e *KubernetesEngine) openLogStream(ctx context.Context, pod *corev1.Pod) (*workflowLogStream, error) {
604604+ clientset, err := kubernetes.NewForConfig(e.config)
605605+ if err != nil {
606606+ return nil, fmt.Errorf("failed to create kubernetes clientset: %w", err)
607607+ }
608608+609609+ // Only use Follow mode for running pods. For completed pods, we need to read
610610+ // existing logs (Follow:true only streams NEW logs after connection).
611611+ shouldFollow := pod.Status.Phase == corev1.PodRunning
612612+613613+ req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{
614614+ Container: "runner",
615615+ Follow: shouldFollow,
616616+ })
617617+618618+ logStream, err := req.Stream(ctx)
619619+ if err != nil {
620620+ return nil, fmt.Errorf("failed to open log stream: %w", err)
621621+ }
622622+623623+ scanner := bufio.NewScanner(logStream)
624624+ buf := make([]byte, 0, 64*1024)
625625+ scanner.Buffer(buf, 1024*1024)
626626+627627+ return &workflowLogStream{
628628+ scanner: scanner,
629629+ stream: logStream,
630630+ pod: pod,
631631+ }, nil
632632+}
633633+634634+// getOrCreateLogStream returns a cached log stream or creates a new one.
635635+func (e *KubernetesEngine) getOrCreateLogStream(ctx context.Context, workflowName, pipelineID string) (*workflowLogStream, error) {
636636+ key := workflowName + "/" + pipelineID
637637+638638+ e.streamMutex.RLock()
639639+ stream, exists := e.logStreams[key]
640640+ e.streamMutex.RUnlock()
641641+ if exists {
642642+ return stream, nil
643643+ }
644644+645645+ logger := log.FromContext(ctx)
646646+647647+ pod, err := e.findPodForWorkflow(ctx, workflowName, pipelineID)
648648+ if err != nil {
649649+ return nil, err
650650+ }
651651+ logger.Info("Found pod for log streaming", "pod", pod.Name, "phase", pod.Status.Phase)
652652+653653+ stream, err = e.openLogStream(ctx, pod)
654654+ if err != nil {
655655+ return nil, err
656656+ }
657657+658658+ e.streamMutex.Lock()
659659+ e.logStreams[key] = stream
660660+ e.streamMutex.Unlock()
661661+662662+ return stream, nil
663663+}
664664+665665+// closeLogStream closes and removes a log stream from the cache.
666666+func (e *KubernetesEngine) closeLogStream(key string) {
667667+ e.streamMutex.Lock()
668668+ defer e.streamMutex.Unlock()
669669+670670+ if stream, exists := e.logStreams[key]; exists {
671671+ stream.stream.Close()
672672+ delete(e.logStreams, key)
673673+ }
674674+}
675675+676676+// readUntilStepEnd reads from the pod log stream until the end control event
677677+// for the specified step. Data lines are forwarded to wfLogger. The stepIDMapper
678678+// translates runner step IDs to log step IDs (identity for single-arch,
679679+// matrixLegLogStepID/finalLogStepID for multi-arch).
680680+func (e *KubernetesEngine) readUntilStepEnd(ctx context.Context, stream *workflowLogStream, runnerStepID int, stepIDMapper func(int) int, wfLogger models.WorkflowLogger) error {
681681+ scanner := stream.scanner
682682+683683+ for scanner.Scan() {
684684+ line := scanner.Text()
685685+686686+ var logLine extendedLogLine
687687+ if err := json.Unmarshal([]byte(line), &logLine); err != nil {
688688+ continue
689689+ }
690690+691691+ // Only process events for the current step
692692+ if logLine.StepId != runnerStepID {
693693+ continue
694694+ }
695695+696696+ mappedID := stepIDMapper(runnerStepID)
697697+698698+ switch logLine.Kind {
699699+ case models.LogKindControl:
700700+ if logLine.StepStatus == models.StepStatusEnd {
701701+ if logLine.ExitCode != 0 {
702702+ return fmt.Errorf("step %d failed with exit code %d", runnerStepID, logLine.ExitCode)
703703+ }
704704+ return nil
705705+ }
706706+707707+ case models.LogKindData:
708708+ if wfLogger == nil {
709709+ continue
710710+ }
711711+ logStream := logLine.Stream
712712+ if logStream == "" {
713713+ logStream = "stdout"
714714+ }
715715+ dataWriter := wfLogger.DataWriter(mappedID, logStream)
716716+ _, _ = dataWriter.Write([]byte(logLine.Content + "\n"))
717717+ }
718718+ }
719719+720720+ if err := scanner.Err(); err != nil {
721721+ return fmt.Errorf("error reading logs: %w", err)
722722+ }
723723+724724+ // Scanner ended without seeing step end event — check pod status
725725+ currentPod := &corev1.Pod{}
726726+ if err := e.client.Get(ctx, client.ObjectKey{Namespace: stream.pod.Namespace, Name: stream.pod.Name}, currentPod); err == nil {
727727+ if currentPod.Status.Phase == corev1.PodSucceeded {
728728+ return nil
729729+ }
730730+ if currentPod.Status.Phase == corev1.PodFailed {
731731+ return fmt.Errorf("pod failed before step %d completed", runnerStepID)
732732+ }
733733+ }
734734+735735+ return fmt.Errorf("log stream ended before step %d completed", runnerStepID)
736736+}
737737+738738+// RunStep streams logs from the runner pod and waits for step completion.
540739// For single-arch workflows, blocks until the step's "end" control event.
541740// For multi-arch workflows:
542741// - idx 0: waits for ALL matrix leg runners to complete all their steps
···550749 }
551750552751 if data.IsMultiArch {
553553- return e.runMultiArchStep(ctx, wid, data, idx, wfLogger)
752752+ return e.runMultiArchStep(ctx, wid, w, data, idx, wfLogger)
554753 }
555754556556- // Single-arch: wait for one runner.
557557- // PipelineID must match what the runner sends — the framework injects
558558- // TANGLED_PIPELINE_ID as the full AT URI (see spindle/models.PipelineEnvVars),
559559- // and the runner uses that value when registering with the hub.
560560- key := loomgrpc.RunnerKey{
561561- PipelineID: wid.PipelineId.AtUri().String(),
562562- WorkflowName: wid.Name,
563563- Architecture: data.Spec.Architecture,
755755+ // Single-arch: get or create the pod log stream
756756+ logStream, err := e.getOrCreateLogStream(ctx, wid.Name, wid.Rkey)
757757+ if err != nil {
758758+ return fmt.Errorf("failed to get log stream: %w", err)
564759 }
565760566566- if idx == 0 {
567567- logger.Info("waiting for runner to connect", "key", key.String())
568568- select {
569569- case <-e.hub.WaitForRunner(key):
570570- logger.Info("runner connected", "key", key.String())
571571- case <-ctx.Done():
572572- return fmt.Errorf("context canceled while waiting for runner: %w", ctx.Err())
573573- case <-time.After(10 * time.Minute):
574574- return fmt.Errorf("timeout waiting for runner to connect")
575575- }
761761+ // Read from stream until this step's end event
762762+ logger.Info("Reading logs for step", "stepID", idx)
763763+ if err := e.readUntilStepEnd(ctx, logStream, idx, func(id int) int { return id }, wfLogger); err != nil {
764764+ e.closeLogStream(wid.Name + "/" + wid.Rkey)
765765+ return fmt.Errorf("failed to read logs for step %d: %w", idx, err)
576766 }
767767+ logger.Info("Step completed", "stepID", idx)
577768578578- return e.waitForRunnerStep(ctx, key, idx, idx, wfLogger)
769769+ // Close stream after last step
770770+ if idx == len(w.Steps)-1 {
771771+ e.closeLogStream(wid.Name + "/" + wid.Rkey)
772772+ }
773773+774774+ return nil
579775}
580776581777// runMultiArchStep handles RunStep for multi-arch workflows.
582582-func (e *KubernetesEngine) runMultiArchStep(ctx context.Context, wid models.WorkflowId, data *kubernetesWorkflowData, idx int, wfLogger models.WorkflowLogger) error {
778778+func (e *KubernetesEngine) runMultiArchStep(ctx context.Context, wid models.WorkflowId, w *models.Workflow, data *kubernetesWorkflowData, idx int, wfLogger models.WorkflowLogger) error {
583779 logger := log.FromContext(ctx).WithValues("workflow", wid.Name, "pipeline", wid.Rkey, "step", idx)
584780585781 if idx == 0 {
···595791 for legIdx, leg := range data.MatrixLegs {
596792 go func() {
597793 legName := fmt.Sprintf("%s-%s", data.Spec.Name, leg.Architecture)
598598- key := loomgrpc.RunnerKey{
599599- PipelineID: wid.PipelineId.AtUri().String(),
600600- WorkflowName: legName,
601601- Architecture: leg.Architecture,
602602- }
603794604604- // Wait for this leg's runner to connect
605605- select {
606606- case <-e.hub.WaitForRunner(key):
607607- logger.Info("matrix leg runner connected", "key", key.String())
608608- case <-ctx.Done():
609609- results <- legResult{leg: leg, err: ctx.Err()}
610610- return
611611- case <-time.After(10 * time.Minute):
612612- results <- legResult{leg: leg, err: fmt.Errorf("timeout waiting for runner %s", key.String())}
795795+ // Open pod log stream for this leg
796796+ logStream, err := e.getOrCreateLogStream(ctx, legName, wid.Rkey)
797797+ if err != nil {
798798+ results <- legResult{leg: leg, err: fmt.Errorf("failed to get log stream for leg %s: %w", leg.Architecture, err)}
613799 return
614800 }
615801616616- // Wait for all steps in this leg to complete, emitting per-leg
617617- // control log lines so each architecture's output gets its own
618618- // step section in the rendered log.
802802+ mapper := func(id int) int { return matrixLegLogStepID(legIdx, id) }
803803+804804+ // Wait for all steps in this leg to complete
619805 for stepIdx, userStep := range data.Spec.Steps {
620806 logStepID := matrixLegLogStepID(legIdx, stepIdx)
621807 sStep := syntheticStep{
···626812 if wfLogger != nil {
627813 _, _ = wfLogger.ControlWriter(logStepID, sStep, models.StepStatusStart).Write([]byte{0})
628814 }
629629- err := e.waitForRunnerStep(ctx, key, stepIdx, logStepID, wfLogger)
815815+ err := e.readUntilStepEnd(ctx, logStream, stepIdx, mapper, wfLogger)
630816 if wfLogger != nil {
631817 _, _ = wfLogger.ControlWriter(logStepID, sStep, models.StepStatusEnd).Write([]byte{0})
632818 }
···636822 }
637823 }
638824825825+ e.closeLogStream(legName + "/" + wid.Rkey)
639826 results <- legResult{leg: leg, err: nil}
640827 }()
641828 }
···661848 }
662849663850 if idx == 1 && data.FinalSpec != nil {
664664- // Final phase: wait for the final runner
851851+ // Final phase
665852 finalName := fmt.Sprintf("%s-final", data.Spec.Name)
666666- key := loomgrpc.RunnerKey{
667667- PipelineID: wid.PipelineId.AtUri().String(),
668668- WorkflowName: finalName,
669669- Architecture: data.FinalSpec.Architecture,
670670- }
853853+854854+ // Stream artifacts from matrix legs to the final runner via gRPC
855855+ if e.artifacts != nil {
856856+ key := loomgrpc.RunnerKey{
857857+ PipelineID: wid.PipelineId.AtUri().String(),
858858+ WorkflowName: finalName,
859859+ Architecture: data.FinalSpec.Architecture,
860860+ }
671861672672- logger.Info("waiting for final runner to connect", "key", key.String())
673673- select {
674674- case <-e.hub.WaitForRunner(key):
675675- logger.Info("final runner connected", "key", key.String())
676676- case <-ctx.Done():
677677- return fmt.Errorf("context canceled while waiting for final runner: %w", ctx.Err())
678678- case <-time.After(10 * time.Minute):
679679- return fmt.Errorf("timeout waiting for final runner to connect")
680680- }
862862+ // Wait for the final runner to connect to gRPC (needed for artifact transfer)
863863+ logger.Info("waiting for final runner to connect for artifact transfer", "key", key.String())
864864+ select {
865865+ case <-e.hub.WaitForRunner(key):
866866+ logger.Info("final runner connected", "key", key.String())
867867+ case <-ctx.Done():
868868+ return fmt.Errorf("context canceled while waiting for final runner: %w", ctx.Err())
869869+ case <-time.After(10 * time.Minute):
870870+ return fmt.Errorf("timeout waiting for final runner to connect")
871871+ }
681872682682- // Stream artifacts from matrix legs to the final runner
683683- if e.artifacts != nil {
684873 rs := e.hub.Get(key)
685874 if rs != nil {
686875 logger.Info("streaming artifacts to final runner", "pipeline", wid.Rkey)
···690879 }
691880 }
692881693693- // Wait for all final steps, emitting per-step control log lines so the
694694- // final phase's user steps are visible in the rendered log (the
695695- // framework only emits a single "Final" control entry above us).
882882+ // Open pod log stream for final phase
883883+ numLegs := len(data.MatrixLegs)
884884+ logStream, err := e.getOrCreateLogStream(ctx, finalName, wid.Rkey)
885885+ if err != nil {
886886+ return fmt.Errorf("failed to get log stream for final phase: %w", err)
887887+ }
888888+889889+ mapper := func(id int) int { return finalLogStepID(numLegs, id) }
890890+891891+ // Wait for all final steps
696892 for stepIdx, finalStep := range data.FinalSpec.Steps {
697697- logStepID := finalLogStepID(len(data.MatrixLegs), stepIdx)
893893+ logStepID := finalLogStepID(numLegs, stepIdx)
698894 sStep := syntheticStep{
699895 name: fmt.Sprintf("%s (final)", finalStep.Name),
700896 command: finalStep.Command,
···703899 if wfLogger != nil {
704900 _, _ = wfLogger.ControlWriter(logStepID, sStep, models.StepStatusStart).Write([]byte{0})
705901 }
706706- err := e.waitForRunnerStep(ctx, key, stepIdx, logStepID, wfLogger)
902902+ err := e.readUntilStepEnd(ctx, logStream, stepIdx, mapper, wfLogger)
707903 if wfLogger != nil {
708904 _, _ = wfLogger.ControlWriter(logStepID, sStep, models.StepStatusEnd).Write([]byte{0})
709905 }
···712908 }
713909 }
714910911911+ e.closeLogStream(finalName + "/" + wid.Rkey)
715912 logger.Info("final steps completed")
716913 return nil
717914 }
718915719916 return fmt.Errorf("unexpected step index %d for multi-arch workflow", idx)
720720-}
721721-722722-// waitForRunnerStep reads events from a runner's gRPC stream until a specific
723723-// step completes. runnerStepID is matched against StepID fields on runner
724724-// events to filter out other steps' events; logStepID is the step id passed to
725725-// wfLogger.DataWriter when forwarding log content. For single-arch workflows
726726-// these are the same; for matrix legs they differ so each leg's logs land in
727727-// a distinct UI step.
728728-func (e *KubernetesEngine) waitForRunnerStep(ctx context.Context, key loomgrpc.RunnerKey, runnerStepID, logStepID int, wfLogger models.WorkflowLogger) error {
729729- rs := e.hub.Get(key)
730730- if rs == nil {
731731- return fmt.Errorf("runner not connected for %s", key.String())
732732- }
733733-734734- for {
735735- select {
736736- case evt := <-rs.Steps:
737737- if evt.StepID != runnerStepID {
738738- continue
739739- }
740740- if evt.Status == "end" {
741741- if evt.ExitCode != 0 {
742742- return fmt.Errorf("step %d failed with exit code %d", runnerStepID, evt.ExitCode)
743743- }
744744- return nil
745745- }
746746-747747- case logEvt := <-rs.Logs:
748748- if logEvt.StepID != runnerStepID || wfLogger == nil {
749749- continue
750750- }
751751- stream := logEvt.Stream
752752- if stream == "" {
753753- stream = "stdout"
754754- }
755755- dataWriter := wfLogger.DataWriter(logStepID, stream)
756756- _, _ = dataWriter.Write([]byte(logEvt.Content + "\n"))
757757-758758- case <-rs.Done:
759759- return fmt.Errorf("runner disconnected before step %d completed", runnerStepID)
760760-761761- case <-ctx.Done():
762762- return fmt.Errorf("context canceled during step %d: %w", runnerStepID, ctx.Err())
763763- }
764764- }
765917}
766918767919// Ensure KubernetesEngine implements the Engine interface
+1-20
internal/grpc/hub.go
···1818 return fmt.Sprintf("%s/%s/%s", k.PipelineID, k.WorkflowName, k.Architecture)
1919}
20202121-// StepEvent represents a step lifecycle event received from a runner.
2222-type StepEvent struct {
2323- StepID int
2424- Status string // "start" or "end"
2525- ExitCode int
2626-}
2727-2828-// LogEvent represents a log line received from a runner.
2929-type LogEvent struct {
3030- StepID int
3131- Stream string // "stdout" or "stderr"
3232- Content string
3333-}
3434-3521// ArtifactEvent represents an artifact chunk received from a runner.
3622type ArtifactEvent struct {
3723 Path string
···4026}
41274228// RunnerStream holds the channels for a single runner connection.
4343-// The gRPC server writes to these channels; the engine reads from them.
2929+// Used for artifact transfer between matrix legs and final jobs.
4430type RunnerStream struct {
4545- Steps chan StepEvent
4646- Logs chan LogEvent
4747-4831 // SendToRunner allows the engine to send messages back to the runner.
4932 // The gRPC server reads from this channel and sends to the runner.
5033 SendToRunner chan *pb.ConnectResponse
···55385639func newRunnerStream() *RunnerStream {
5740 return &RunnerStream{
5858- Steps: make(chan StepEvent, 64),
5959- Logs: make(chan LogEvent, 256),
6041 SendToRunner: make(chan *pb.ConnectResponse, 64),
6142 Done: make(chan struct{}),
6243 }
+3-15
internal/grpc/server.go
···7171 }()
72727373 // Process the first message's event (if any)
7474- s.processEvent(key, rs, msg)
7474+ s.processEvent(key, msg)
75757676 // Start goroutine to send responses back to the runner
7777 go func() {
···100100 if err != nil {
101101 return fmt.Errorf("recv error from %s: %w", key.String(), err)
102102 }
103103- s.processEvent(key, rs, msg)
103103+ s.processEvent(key, msg)
104104 }
105105}
106106107107// processEvent routes a runner event to the appropriate channel.
108108-func (s *Server) processEvent(key RunnerKey, rs *RunnerStream, msg *pb.ConnectRequest) {
108108+func (s *Server) processEvent(key RunnerKey, msg *pb.ConnectRequest) {
109109 logger := log.Log.WithName("grpc")
110110111111 switch evt := msg.Event.(type) {
112112- case *pb.ConnectRequest_StepControl:
113113- rs.Steps <- StepEvent{
114114- StepID: int(evt.StepControl.StepId),
115115- Status: evt.StepControl.Status,
116116- ExitCode: int(evt.StepControl.ExitCode),
117117- }
118118- case *pb.ConnectRequest_LogLine:
119119- rs.Logs <- LogEvent{
120120- StepID: int(evt.LogLine.StepId),
121121- Stream: evt.LogLine.Stream,
122122- Content: evt.LogLine.Content,
123123- }
124112 case *pb.ConnectRequest_ArtifactChunk:
125113 // Persist artifact to disk; final jobs stream it back via StreamToRunner.
126114 if s.artifacts != nil {