my harness for niri
import OpenAI from "openai"
import { logMessage } from "../db.js"
import { buildCompletionMessages, rememberRecalledMemoryChunks } from "../memory.js"
import { recordMetric } from "../metrics.js"
import { emit } from "../stream.js"
import type { LoopState } from "./types.js"
import {
  API_BASE,
  ENABLE_THINKING,
  FALLBACK_BASE,
  FALLBACK_MODEL,
  FALLBACK_TOOL_CHOICE,
  MODEL,
  PRIMARY_TOOL_CHOICE,
  SUMMARY_MODEL,
  TOOLS,
  USE_FALLBACK,
  apiErrorDetails,
  client,
  errorSummary,
  estimatePromptTokens,
  fallbackClient,
  fallbackContextWindow,
  findSummaryMessageIndex,
  isPromptTooLargeError,
  retryDelayMs,
  sanitizeMessages,
  shouldFallback,
  summaryClient,
  summarizeConversationViaLLM,
} from "./util.js"
import { assistantContentText } from "./loop-content.js"
import type { CompletionRequest, CompletionTurnResult, ToolCallAssembly } from "./loop-shared.js"

/**
 * Resolves the configured summary client/model pair.
 *
 * @returns Active summary provider config.
 */
export function configuredSummaryProvider(): { client: OpenAI | null; model: string } {
  if (summaryClient && SUMMARY_MODEL) return { client: summaryClient, model: SUMMARY_MODEL }
  return {
    client: USE_FALLBACK ? fallbackClient : client,
    model: USE_FALLBACK ? FALLBACK_MODEL : MODEL,
  }
}
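
// Usage sketch (mirrors recoverFromPromptTooLarge further below; not executed here):
//
//   const summaryProvider = configuredSummaryProvider()
//   if (summaryProvider.client && summaryProvider.model) {
//     await summarizeConversationViaLLM(state.conversation, summaryProvider.client, summaryProvider.model)
//   }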

// Logs OpenAI API errors with request context; non-API errors are ignored here.
function logApiError(err: unknown, context: string): void {
  if (!(err instanceof OpenAI.APIError)) return
  console.error(`[api] ${err.status} ${err.message} - ${context}`)
  for (const line of apiErrorDetails(err)) console.error(line)
}

/**
 * Appends an assistant message to state and persists it to conversation logs.
 *
 * @param convId - Active conversation id.
 * @param state - Mutable loop state.
 * @param msg - Assistant message to append.
 */
export function addAssistantMessage(convId: number, state: LoopState, msg: OpenAI.Chat.ChatCompletionMessage): void {
  state.conversation.push(msg)
  logMessage(convId, msg.role, msg.content ?? "", msg.tool_calls ?? undefined)
}

// Records a prompt/response pair against the originating prompt metric, if any.
function recordPromptResponse(request: CompletionRequest, result: CompletionTurnResult, promptMetricId: number | null): void {
  recordMetric({
    type: "prompt_response",
    promptMetricId: promptMetricId ?? undefined,
    model: request.model,
    toolChoice: request.tool_choice,
    messages: request.messages,
    response: result.message,
    usage: result.usage,
  })
}

/**
 * Applies token usage from a completion response to loop state counters.
 *
 * @param state - Mutable loop state.
 * @param usage - Completion usage payload (if provided by the API).
 */
export function applyUsage(state: LoopState, usage: OpenAI.Completions.CompletionUsage | undefined): void {
  if (!usage) return
  state.tokenCount += usage.total_tokens
  if (usage.prompt_tokens) state.contextSize = usage.prompt_tokens
  console.log(`[tokens] +${usage.total_tokens} total=${state.tokenCount}`)
  recordMetric({ type: "usage", usage })
}

/**
 * Emits model reasoning text when exposed by the provider.
 *
 * Supports both `reasoning_content` and `<think>...</think>` wrappers.
 *
 * @param msg - Assistant message to inspect for reasoning traces.
 */
export function emitThinking(msg: OpenAI.Chat.ChatCompletionMessage): void {
  const rawMsg = msg as unknown as Record<string, unknown>
  let thinkingText: string | null = null

  if (typeof rawMsg.reasoning_content === "string" && rawMsg.reasoning_content.trim()) {
    thinkingText = rawMsg.reasoning_content.trim()
  } else if (typeof msg.content === "string") {
    const match = msg.content.match(/^<think>([\s\S]*?)<\/think>\s*/i)
    if (match) {
      thinkingText = match[1]!.trim()
      ;(msg as unknown as Record<string, unknown>).content = msg.content.slice(match[0].length)
    }
  }

  if (thinkingText) emit({ type: "thinking", text: thinkingText })
}
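
// Illustrative sketch (not executed): a message whose content arrives wrapped in
// <think> tags. emitThinking strips the wrapper in place and emits the inner text
// as a "thinking" event; the remaining content is what downstream consumers see.
//
//   const msg = { role: "assistant", content: "<think>plan the reply</think>Hello!", refusal: null }
//   emitThinking(msg as OpenAI.Chat.ChatCompletionMessage)
//   // emit({ type: "thinking", text: "plan the reply" }) fires, and msg.content === "Hello!"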

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

function formatRetryAt(retryAfterMs: number): string {
  const retryAt = new Date(Date.now() + retryAfterMs)
  const local = retryAt.toLocaleString(undefined, {
    hour12: false,
    timeZoneName: "short",
  })
  return `${local} (${retryAt.toISOString()})`
}

// Combines the error message with its structured error payload for regex matching.
function apiErrorSearchText(err: { message: string; error?: unknown }): string {
  const parts = [err.message]
  if (err.error !== undefined) {
    try {
      parts.push(JSON.stringify(err.error))
    } catch {
      parts.push(String(err.error))
    }
  }
  return parts.join("\n")
}

function shouldRetryWithAutoToolChoice(err: unknown): boolean {
  if (!(err instanceof OpenAI.APIError)) return false
  const text = apiErrorSearchText(err)
  return /no endpoints found that support the provided 'tool_choice' value|does not support this tool_choice/i.test(text)
}

function shouldRetryWithoutReasoningForTools(err: unknown): boolean {
  if (!(err instanceof OpenAI.APIError)) return false
  return /function call should not be used with prefix/i.test(apiErrorSearchText(err))
}

// Request extras that keep tool calling usable on providers whose reasoning/prefix
// mode conflicts with function calls.
function toolCompatibleReasoningExtras(
  request?: Pick<CompletionRequest, "provider" | "chat_template_kwargs">,
): Partial<CompletionRequest> {
  return {
    include_reasoning: false,
    reasoning: { enabled: false, exclude: true, effort: "none" },
    provider: { ...request?.provider, require_parameters: true },
  }
}

// Same as above, plus template-level flags for providers that gate thinking via
// chat_template_kwargs.
function prefixModeToolCallExtras(request?: Pick<CompletionRequest, "provider" | "chat_template_kwargs">): Partial<CompletionRequest> {
  return {
    ...toolCompatibleReasoningExtras(request),
    enable_thinking: false,
    chat_template_kwargs: {
      ...request?.chat_template_kwargs,
      enable_thinking: false,
    },
  }
}

function disableReasoningForToolCalls(request: CompletionRequest): CompletionRequest {
  return {
    ...request,
    ...prefixModeToolCallExtras(request),
  }
}
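
// Illustrative sketch (not executed): the extras merged into a request when a provider
// rejects function calling in reasoning/prefix mode. Field values mirror
// prefixModeToolCallExtras above; the base request shown is hypothetical.
//
//   disableReasoningForToolCalls({ model: MODEL, messages, tools: TOOLS, tool_choice: "auto" })
//   // => { ...original request,
//   //      include_reasoning: false,
//   //      reasoning: { enabled: false, exclude: true, effort: "none" },
//   //      provider: { require_parameters: true },
//   //      enable_thinking: false,
//   //      chat_template_kwargs: { enable_thinking: false } }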

// When thinking is disabled via config, these extras ask the provider to skip
// reasoning output entirely.
function configuredThinkingRequestExtras(
  request?: Pick<CompletionRequest, "chat_template_kwargs">,
): Partial<CompletionRequest> {
  if (ENABLE_THINKING) return {}
  return {
    include_reasoning: false,
    reasoning: { enabled: false, exclude: true, effort: "none" },
    enable_thinking: false,
    chat_template_kwargs: {
      ...request?.chat_template_kwargs,
      enable_thinking: false,
    },
  }
}

function openRouterToolRequestExtras(baseUrl: string): Partial<CompletionRequest> {
  if (!baseUrl.includes("openrouter.ai")) return {}
  return toolCompatibleReasoningExtras()
}

function shouldRetryWithoutStreamUsage(err: unknown): boolean {
  if (!(err instanceof OpenAI.APIError)) return false
  if (err.status !== 400) return false
  return /stream_options|include_usage/i.test(err.message)
}

// Best-effort conversion of a <parameter> body (always streamed as text) into a typed
// JSON value: booleans, null, numbers, and JSON literals are parsed; anything else
// stays a raw string.
function coerceReasoningToolArgument(rawValue: string): unknown {
  const value = rawValue.trim()
  if (!value) return ""
  if (/^true$/i.test(value)) return true
  if (/^false$/i.test(value)) return false
  if (/^null$/i.test(value)) return null

  if (/^-?\d+(?:\.\d+)?$/.test(value)) {
    const parsed = Number(value)
    if (Number.isFinite(parsed)) return parsed
  }

  if (
    (value.startsWith("{") && value.endsWith("}")) ||
    (value.startsWith("[") && value.endsWith("]")) ||
    (value.startsWith('"') && value.endsWith('"'))
  ) {
    try {
      return JSON.parse(value)
    } catch {
      // keep raw string fallback
    }
  }

  return value
}
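
// Worked examples (not executed) of the coercion rules above; values are hypothetical:
//
//   coerceReasoningToolArgument("true")            // => true
//   coerceReasoningToolArgument("42.5")            // => 42.5
//   coerceReasoningToolArgument('{"city":"Oslo"}') // => { city: "Oslo" }
//   coerceReasoningToolArgument("not json {")      // => "not json {" (raw string kept)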

// Parses one <tool_call> block emitted inside reasoning text into an OpenAI-style
// tool call. Returns null when the block is malformed or incomplete.
function parseReasoningToolCallBlock(rawBlock: string): ToolCallAssembly | null {
  const functionMatch = rawBlock.match(/<function(?:=|\s+name\s*=\s*["']?)([^>"'\s/]+)["']?\s*>/i)
  if (!functionMatch || functionMatch.index === undefined) return null

  const functionName = functionMatch[1]?.trim()
  if (!functionName) return null

  const functionBodyStart = functionMatch.index + functionMatch[0].length
  const functionBodyEnd = rawBlock.indexOf("</function>", functionBodyStart)
  if (functionBodyEnd < 0) return null

  const functionBody = rawBlock.slice(functionBodyStart, functionBodyEnd)
  const args: Record<string, unknown> = {}

  const parameterRegex = /<parameter(?:=|\s+name\s*=\s*["']?)([^>"'\s/]+)["']?\s*>([\s\S]*?)<\/parameter>/gi
  for (const match of functionBody.matchAll(parameterRegex)) {
    const key = match[1]?.trim()
    if (!key) continue
    args[key] = coerceReasoningToolArgument(match[2] ?? "")
  }

  return {
    id: "",
    type: "function",
    function: {
      name: functionName,
      arguments: JSON.stringify(args),
    },
  }
}
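
// Illustrative sketch (not executed): parsing a block in the <function>/<parameter>
// convention this parser targets. The tool name and parameter are hypothetical.
//
//   parseReasoningToolCallBlock("<function=get_weather><parameter=city>Oslo</parameter></function>")
//   // => { id: "", type: "function",
//   //      function: { name: "get_weather", arguments: '{"city":"Oslo"}' } }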

// Extracts every complete <tool_call>...</tool_call> block from the streaming buffer
// and returns the unconsumed tail (a partially received block) as the new remainder.
function drainReasoningToolCallBlocks(buffer: string): { blocks: string[]; remainder: string } {
  const blocks: string[] = []
  let remaining = buffer

  while (true) {
    const openMatch = remaining.match(/<tool_call(?:\s[^>]*)?>/i)
    if (!openMatch || openMatch.index === undefined) {
      const partialStart = remaining.lastIndexOf("<tool_call")
      return {
        blocks,
        remainder: partialStart >= 0 ? remaining.slice(partialStart) : "",
      }
    }

    const openStart = openMatch.index
    const openEnd = openStart + openMatch[0].length
    const closeStart = remaining.indexOf("</tool_call>", openEnd)
    if (closeStart < 0) {
      return {
        blocks,
        remainder: remaining.slice(openStart),
      }
    }

    blocks.push(remaining.slice(openEnd, closeStart))
    remaining = remaining.slice(closeStart + "</tool_call>".length)
  }
}
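
// Illustrative sketch (not executed): one complete block is drained, and the trailing
// partial tag is kept so the next delta can finish it. Content is hypothetical.
//
//   drainReasoningToolCallBlocks(
//     "<tool_call><function=get_weather></function></tool_call><tool_call",
//   )
//   // => { blocks: ["<function=get_weather></function>"], remainder: "<tool_call" }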

// Consumes a streamed chat completion: forwards text deltas to the event stream,
// assembles structured tool calls, and also recovers tool calls that some providers
// emit as <tool_call> blocks inside reasoning_content.
async function consumeCompletionStream(
  stream: AsyncIterable<OpenAI.Chat.ChatCompletionChunk>,
): Promise<CompletionTurnResult> {
  const contentParts: string[] = []
  const streamedToolCalls = new Map<number, ToolCallAssembly>()
  const reasoningToolCalls: ToolCallAssembly[] = []
  let reasoningToolBuffer = ""

  let usage: OpenAI.Completions.CompletionUsage | undefined
  let emittedText = false
  let emittedThinking = false
  const reasoningParts: string[] = []

  for await (const chunk of stream) {
    if (chunk.usage) usage = chunk.usage

    const choice = chunk.choices[0]
    if (!choice) continue

    const delta = choice.delta as OpenAI.Chat.ChatCompletionChunk.Choice.Delta & {
      reasoning_content?: string
    }

    if (typeof delta.reasoning_content === "string" && delta.reasoning_content.length > 0) {
      if (ENABLE_THINKING) reasoningParts.push(delta.reasoning_content)

      reasoningToolBuffer += delta.reasoning_content
      const { blocks, remainder } = drainReasoningToolCallBlocks(reasoningToolBuffer)
      reasoningToolBuffer = remainder

      for (const block of blocks) {
        const parsedCall = parseReasoningToolCallBlock(block)
        if (!parsedCall) continue
        parsedCall.id = `call_reasoning_${reasoningToolCalls.length}`
        reasoningToolCalls.push(parsedCall)
      }
    }

    if (typeof delta.content === "string" && delta.content.length > 0) {
      if (ENABLE_THINKING && !emittedThinking && reasoningParts.length > 0) {
        emit({ type: "thinking", text: reasoningParts.join("") })
        emittedThinking = true
      }
      contentParts.push(delta.content)
      emit({ type: "text", text: delta.content })
      emittedText = true
    }

    if (!Array.isArray(delta.tool_calls)) continue

    for (const partial of delta.tool_calls) {
      const index = partial.index ?? 0
      const existing = streamedToolCalls.get(index) ?? {
        id: partial.id ?? `call_${index}`,
        type: "function" as const,
        function: { name: "", arguments: "" },
      }

      if (partial.id) existing.id = partial.id
      if (partial.type === "function") existing.type = "function"
      if (partial.function?.name) existing.function.name += partial.function.name
      if (partial.function?.arguments) existing.function.arguments += partial.function.arguments
      streamedToolCalls.set(index, existing)
    }
  }

  if (streamedToolCalls.size === 0) {
    const trailingReasoningCall = parseReasoningToolCallBlock(reasoningToolBuffer)
    if (trailingReasoningCall) {
      trailingReasoningCall.id = `call_reasoning_${reasoningToolCalls.length}`
      reasoningToolCalls.push(trailingReasoningCall)
    }
  }

  const finalToolCalls =
    streamedToolCalls.size > 0
      ? [...streamedToolCalls.entries()]
          .sort((a, b) => a[0] - b[0])
          .map(([, toolCall]) => toolCall)
      : reasoningToolCalls

  const message: OpenAI.Chat.ChatCompletionMessage = {
    role: "assistant",
    content: contentParts.length > 0 ? contentParts.join("") : null,
    refusal: null,
    ...(finalToolCalls.length > 0
      ? {
          tool_calls: finalToolCalls,
        }
      : {}),
  }

  if (reasoningParts.length > 0) {
    ;(message as OpenAI.Chat.ChatCompletionMessage & { reasoning_content?: string }).reasoning_content =
      reasoningParts.join("")
  }

  return {
    message,
    usage,
    emittedText,
    emittedThinking,
    bufferedThinking: reasoningParts.join(""),
  }
}

// Issues a streaming completion request, recording prompt/response metrics. If the
// provider rejects stream_options.include_usage, the request is retried once without it.
async function createStreamedCompletion(
  apiClient: OpenAI,
  request: CompletionRequest,
): Promise<CompletionTurnResult> {
  const streamedRequest = {
    ...request,
    stream: true,
    stream_options: { include_usage: true },
  } as const

  const promptMetricId = recordMetric({ type: "prompt", messages: request.messages })

  try {
    const stream = await apiClient.chat.completions.create(streamedRequest)
    const result = await consumeCompletionStream(stream as AsyncIterable<OpenAI.Chat.ChatCompletionChunk>)
    recordPromptResponse(request, result, promptMetricId)
    return result
  } catch (err) {
    if (shouldRetryWithoutStreamUsage(err)) {
      const stream = await apiClient.chat.completions.create({
        ...request,
        stream: true,
      } as const)
      const result = await consumeCompletionStream(stream as AsyncIterable<OpenAI.Chat.ChatCompletionChunk>)
      recordPromptResponse(request, result, promptMetricId)
      return result
    }
    throw err
  }
}

// Runs a completion against the fallback provider, downgrading tool_choice or
// disabling reasoning (once each) if the provider rejects the stricter request.
async function createFallbackCompletion(messages: OpenAI.Chat.ChatCompletionMessageParam[]): Promise<CompletionTurnResult> {
  const request: CompletionRequest = {
    model: FALLBACK_MODEL,
    messages,
    tools: TOOLS,
    tool_choice: FALLBACK_TOOL_CHOICE,
    ...openRouterToolRequestExtras(FALLBACK_BASE),
    ...configuredThinkingRequestExtras(),
  }

  let currentRequest = request
  let retriedAutoToolChoice = false
  let retriedWithoutReasoning = false
  while (true) {
    try {
      return await createStreamedCompletion(fallbackClient, currentRequest)
    } catch (err) {
      if (currentRequest.tool_choice !== "auto" && !retriedAutoToolChoice && shouldRetryWithAutoToolChoice(err)) {
        retriedAutoToolChoice = true
        console.warn(
          `[fallback] provider rejected tool_choice=${currentRequest.tool_choice}; retrying with tool_choice=auto`,
        )
        currentRequest = {
          ...currentRequest,
          tool_choice: "auto",
        }
        continue
      }
      if (!retriedWithoutReasoning && shouldRetryWithoutReasoningForTools(err)) {
        retriedWithoutReasoning = true
        console.warn("[fallback] provider rejected function calling in reasoning/prefix mode; retrying fallback with tool-compatible reasoning disabled")
        currentRequest = disableReasoningForToolCalls(currentRequest)
        continue
      }
      throw err
    }
  }
}

// Runs a completion against the primary provider with the same one-shot retry
// downgrades as the fallback path.
async function createPrimaryCompletion(messages: OpenAI.Chat.ChatCompletionMessageParam[]): Promise<CompletionTurnResult> {
  const request: CompletionRequest = {
    model: MODEL,
    messages,
    tools: TOOLS,
    tool_choice: PRIMARY_TOOL_CHOICE,
    ...openRouterToolRequestExtras(API_BASE),
    ...configuredThinkingRequestExtras(),
  }

  let currentRequest = request
  let retriedAutoToolChoice = false
  let retriedWithoutReasoning = false
  while (true) {
    try {
      return await createStreamedCompletion(client!, currentRequest)
    } catch (err) {
      if (currentRequest.tool_choice !== "auto" && !retriedAutoToolChoice && shouldRetryWithAutoToolChoice(err)) {
        retriedAutoToolChoice = true
        console.warn(`[api] provider rejected tool_choice=${currentRequest.tool_choice}; retrying primary with tool_choice=auto`)
        currentRequest = {
          ...currentRequest,
          tool_choice: "auto",
        }
        continue
      }
      if (!retriedWithoutReasoning && shouldRetryWithoutReasoningForTools(err)) {
        retriedWithoutReasoning = true
        console.warn("[api] provider rejected function calling in reasoning/prefix mode; retrying primary with tool-compatible reasoning disabled")
        currentRequest = disableReasoningForToolCalls(currentRequest)
        continue
      }
      throw err
    }
  }
}
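
// Illustrative sketch (not executed): the kinds of provider errors that trigger the
// one-shot downgrades above. The messages mirror the regexes in
// shouldRetryWithAutoToolChoice / shouldRetryWithoutReasoningForTools; the exact
// status codes and payload shapes are assumptions.
//
//   // APIError: "No endpoints found that support the provided 'tool_choice' value"
//   //   -> retried once with tool_choice: "auto"
//   // APIError: "function call should not be used with prefix"
//   //   -> retried once via disableReasoningForToolCalls()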

// Logs prompt-size diagnostics (message counts by role, token estimate, raw size)
// when a provider rejects the prompt as too large.
function logPromptSizeDebug(state: LoopState, err: unknown, label: string): void {
  const messageCount = state.conversation.length
  const roleCounts = state.conversation.reduce<Record<string, number>>((acc, m) => {
    const role = (m as { role?: string }).role ?? "unknown"
    acc[role] = (acc[role] ?? 0) + 1
    return acc
  }, {})
  const estimate = estimatePromptTokens(state.conversation)
  const charLength = JSON.stringify(state.conversation).length
  const summary = err instanceof OpenAI.APIError ? `${err.status} ${err.message}` : errorSummary(err)
  console.warn(
    `[api] ${label}: ${summary} - messages=${messageCount} est_tokens=${estimate} chars=${charLength} roles=${JSON.stringify(roleCounts)} observedPromptTokens=${state.contextSize}`,
  )
}

// Attempts to shrink the conversation via LLM summarization after a prompt-too-large
// rejection. Returns true only when the summarized conversation is actually smaller.
async function recoverFromPromptTooLarge(state: LoopState, attempt: number): Promise<boolean> {
  const beforeCount = state.conversation.length
  const beforeEstimate = estimatePromptTokens(state.conversation)

  const summaryProvider = configuredSummaryProvider()
  if (!summaryProvider.client || !summaryProvider.model) {
    console.warn(`[context] recovery: no summary client available; cannot llm-summarize`)
    return false
  }

  console.warn(`[context] recovery: attempting llm summarization via ${summaryProvider.model} (attempt=${attempt + 1})`)
  const summarized = await summarizeConversationViaLLM(state.conversation, summaryProvider.client, summaryProvider.model)
  if (!summarized) {
    console.warn(`[context] recovery: llm summarization returned no changes`)
    return false
  }

  const afterEstimate = estimatePromptTokens(summarized)
  if (afterEstimate >= beforeEstimate) {
    console.warn(`[context] recovery: llm summary not smaller (${beforeEstimate} -> ${afterEstimate}); keeping original`)
    return false
  }

  state.conversation = summarized
  state.contextSize = afterEstimate

  const summaryIdx = findSummaryMessageIndex(state.conversation)
  const summary = summaryIdx >= 0 ? (state.conversation[summaryIdx]?.content as string) : undefined

  console.warn(
    `[context] recovery: llm-summarized conversation (${beforeCount} -> ${summarized.length} msgs, ${beforeEstimate} -> ${afterEstimate} tokens)`,
  )

  recordMetric({
    type: "compaction",
    before: beforeEstimate,
    after: afterEstimate,
    method: "force-llm",
    summary,
  })
  return true
}

/**
 * Fetches the next assistant completion, including fallback and backoff behavior.
 *
 * @param state - Mutable loop state containing the current conversation/context.
 * @param baseConversation - Optional alternate base conversation for retries.
 * @returns The next completion turn result.
 * @throws If the primary request fails with a non-fallback error condition.
 */
export async function fetchCompletion(
  state: LoopState,
  baseConversation: OpenAI.Chat.ChatCompletionMessageParam[] = state.conversation,
): Promise<CompletionTurnResult> {
  let promptTooLargeAttempts = 0
  while (true) {
    if (baseConversation === state.conversation) {
      state.conversation = sanitizeMessages(state.conversation)
      baseConversation = state.conversation
    } else {
      baseConversation = sanitizeMessages(baseConversation)
    }

    const requestContext = await buildCompletionMessages(
      baseConversation,
      state.memoryRecallCooldowns,
      state.memoryRecallTurn,
    )
    const requestMessages = requestContext.messages

    if (USE_FALLBACK) {
      const fallbackWindow = fallbackContextWindow(requestMessages)
      if (fallbackWindow.nearLimit) {
        console.warn(
          `[fallback] prompt estimate ${fallbackWindow.estimate} nearing fallback limit ${fallbackWindow.softLimit} (${FALLBACK_MODEL})`,
        )
      }

      try {
        const completion = await createFallbackCompletion(requestMessages)
        state.memoryRecallCooldowns = rememberRecalledMemoryChunks(
          state.memoryRecallCooldowns,
          requestContext.recalledChunkIds,
          state.memoryRecallTurn,
        )
        return completion
      } catch (fallbackErr) {
        if (isPromptTooLargeError(fallbackErr) && promptTooLargeAttempts < 2) {
          logPromptSizeDebug(state, fallbackErr, `fallback rejected prompt (attempt ${promptTooLargeAttempts + 1}/2)`)
          const recovered = await recoverFromPromptTooLarge(state, promptTooLargeAttempts)
          promptTooLargeAttempts++
          if (recovered) continue
        }
        if (shouldFallback(fallbackErr)) {
          const retryAfter = retryDelayMs(fallbackErr)
          console.warn(
            `[fallback] transient failure (${errorSummary(fallbackErr)}); retrying after ${Math.ceil(retryAfter / 1000)}s`,
          )
          console.log(
            `[runner] backing off ${Math.ceil(retryAfter / 1000)}s (until ${formatRetryAt(retryAfter)}) before retrying fallback...`,
          )
          await sleep(retryAfter)
          continue
        }
        logApiError(fallbackErr, `model=${FALLBACK_MODEL} api=${FALLBACK_BASE}`)
        throw fallbackErr
      }
    }

    try {
      const completion = await createPrimaryCompletion(requestMessages)
      state.memoryRecallCooldowns = rememberRecalledMemoryChunks(
        state.memoryRecallCooldowns,
        requestContext.recalledChunkIds,
        state.memoryRecallTurn,
      )
      return completion
    } catch (primaryErr) {
      if (isPromptTooLargeError(primaryErr) && promptTooLargeAttempts < 2) {
        logPromptSizeDebug(state, primaryErr, `primary rejected prompt (attempt ${promptTooLargeAttempts + 1}/2)`)
        const recovered = await recoverFromPromptTooLarge(state, promptTooLargeAttempts)
        promptTooLargeAttempts++
        if (recovered) continue
        logApiError(primaryErr, `model=${MODEL} api=${API_BASE}`)
        throw primaryErr
      }

      if (!shouldFallback(primaryErr)) {
        logApiError(primaryErr, `model=${MODEL} api=${API_BASE}`)
        throw primaryErr
      }

      const fallbackWindow = fallbackContextWindow(requestMessages)
      if (fallbackWindow.skip) {
        console.warn(
          `[api] primary down (${errorSummary(primaryErr)}) and fallback context estimate ${fallbackWindow.estimate} exceeds hard limit ${fallbackWindow.hardLimit}; retrying primary after backoff`,
        )
        const retryAfter = retryDelayMs(primaryErr)
        console.log(
          `[runner] backing off ${Math.ceil(retryAfter / 1000)}s (until ${formatRetryAt(retryAfter)}) before retrying primary...`,
        )
        await sleep(retryAfter)
        continue
      }

      if (fallbackWindow.nearLimit) {
        console.warn(
          `[fallback] prompt estimate ${fallbackWindow.estimate} nearing fallback limit ${fallbackWindow.softLimit} (${FALLBACK_MODEL})`,
        )
      }

      console.warn(`[api] primary down (${errorSummary(primaryErr)}) - switching to fallback`)
      try {
        const completion = await createFallbackCompletion(requestMessages)
        state.memoryRecallCooldowns = rememberRecalledMemoryChunks(
          state.memoryRecallCooldowns,
          requestContext.recalledChunkIds,
          state.memoryRecallTurn,
        )
        return completion
      } catch (fallbackErr) {
        if (isPromptTooLargeError(fallbackErr) && promptTooLargeAttempts < 2) {
          logPromptSizeDebug(state, fallbackErr, `fallback rejected prompt during failover (attempt ${promptTooLargeAttempts + 1}/2)`)
          const recovered = await recoverFromPromptTooLarge(state, promptTooLargeAttempts)
          promptTooLargeAttempts++
          if (recovered) continue
        }
        console.warn(
          `[api] fallback failed (${errorSummary(fallbackErr)}) after primary failure (${errorSummary(primaryErr)}); retrying primary after backoff`,
        )
        const retryAfter = retryDelayMs(primaryErr)
        console.log(
          `[runner] backing off ${Math.ceil(retryAfter / 1000)}s (until ${formatRetryAt(retryAfter)}) before retrying primary...`,
        )
        await sleep(retryAfter)
      }
    }
  }
}
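
// Usage sketch (hypothetical caller, not part of this module): one turn of the agent
// loop. convId and state are assumed to come from the surrounding runner.
//
//   const result = await fetchCompletion(state)
//   applyUsage(state, result.usage)
//   emitThinking(result.message)
//   addAssistantMessage(convId, state, result.message)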

// Exposed for unit tests only.
export const __completionTest = {
  consumeCompletionStream,
}