···11+This project includes code derived from third-party open-source projects.
22+33+---
44+55+Project: mlx-knife
66+Author: The BROKE team 🦫
77+Source: https://github.com/mzau/mlx-knife
88+License: MIT
99+1010+Description:
1111+The MLX-related modules from mlx-knife have been used as our starting point and for further reference.
1212+1313+1414+Project: mem-agent-mcp
1515+Author: Dria
1616+Source: https://github.com/firstbatchxyz/mem-agent-mcp
1616+License: Apache-2.0
1818+1919+Description:
2020+The mem-agent CLI modules from mem-agent-mcp have been used as our starting point and for further reference.
···11#!/usr/bin/env bash
22set -euo pipefail
3344-ENV="prod" # prod is another env, try taking it from github env
55-REPO="tilesprivacy/tilekit"
44+ENV="dev" # prod is another env, try taking it from github env
55+REPO="tilesprivacy/tiles"
66# VERSION="${TILES_VERSION:-latest}"
77-VERSION="0.2.0"
77+VERSION="0.3.0"
88INSTALL_DIR="$HOME/.local/bin" # CLI install location
99SERVER_DIR="$HOME/.local/share/tiles/server" # Python server folder
1010TMPDIR="$(mktemp -d)"
···4343 TAR_URL="https://github.com/${REPO}/releases/download/${VERSION}/tiles-v${VERSION}-${ARCH}-${OS}.tar.gz"
4444 curl -fsSL -o "${TMPDIR}/tiles.tar.gz" "$TAR_URL"
4545else
4646- # Installer suppose to ran from tilekit root folder after running the bundler
4646+ # The installer is supposed to be run from the tiles root folder after running the bundler
4747 mv "dist/tiles-v${VERSION}-${ARCH}-${OS}.tar.gz" "${TMPDIR}/tiles.tar.gz"
4848fi
4949···7171 log "Installing Python 3.13 via Homebrew..."
7272 brew install python@3.13 || err "Failed to install Python 3.13"
7373 else
7474- err "Python 3.13 is required but not found. Please install it manually."
7474+ err "Python 3.13 is required but not found. Please install it manually and retry installing tiles"
7575 fi
7676fi
7777
···11-# MIT License
22-33-# Copyright (c) 2025 The BROKE team 🦫
44-55-# Permission is hereby granted, free of charge, to any person obtaining a copy
66-# of this software and associated documentation files (the "Software"), to deal
77-# in the Software without restriction, including without limitation the rights
88-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
99-# copies of the Software, and to permit persons to whom the Software is
1010-# furnished to do so, subject to the following conditions:
1111-1212-# The above copyright notice and this permission notice shall be included in all
1313-# copies or substantial portions of the Software.
1414-1515-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1616-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1717-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121-# SOFTWARE.
2222-231"""
242Enhanced MLX model runner with direct API integration.
253Provides ollama-like run experience with streaming and interactive chat.
264"""
27566+import sys
287import json
298import os
309import time
···3211from pathlib import Path
3312from typing import Dict, Optional
34133535-import mlx.core as mx
1414+if sys.platform == "darwin":
1515+ import mlx.core as mx
1616+else:
1717+ mx = None
3618from mlx_lm import load
3719from mlx_lm.generate import generate_step
3820from mlx_lm.sample_utils import make_repetition_penalty, make_sampler
39214040-from .reasoning_utils import ReasoningExtractor, StreamingReasoningParser
2222+from ..reasoning_utils import ReasoningExtractor, StreamingReasoningParser
def get_model_context_length(model_path: str) -> int:
    """Extract the maximum context length from a model's ``config.json``.

    The first usable value among the common context-length keys wins. A value
    is usable only if it is a positive integer; ``null``, strings, booleans
    and non-positive numbers are skipped so the function always honours its
    ``int`` return type.

    Args:
        model_path: Path to the MLX model directory

    Returns:
        Maximum context length for the model (defaults to 4096 if the config
        is missing, unreadable, malformed, or has no usable key)
    """
    default_context_length = 4096
    config_path = os.path.join(model_path, "config.json")

    try:
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # OSError covers missing files, permission problems and directories;
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return default_context_length

    if not isinstance(config, dict):
        # A JSON root that is a list/number/string carries no config keys.
        return default_context_length

    # Common config keys for context length, in priority order
    context_keys = (
        "max_position_embeddings",
        "n_positions",
        "context_length",
        "max_sequence_length",
        "seq_len",
    )

    for key in context_keys:
        value = config.get(key)
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(value, int) and not isinstance(value, bool) and value > 0:
            return value

    # No usable context length found: return a reasonable default
    return default_context_length
···7961class MLXRunner:
8062 """Direct MLX model runner with streaming and interactive capabilities."""
81638282- def __init__(self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False):
6464+ def __init__(
6565+ self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False
6666+ ):
8367 """Initialize the runner with a model.
8484-6868+8569 Args:
8670 model_path: Path to the MLX model directory
8771 adapter_path: Optional path to LoRA adapter
···10791 def __enter__(self):
10892 """Context manager entry - loads the model."""
10993 if self._context_entered:
110110- raise RuntimeError("MLXRunner context manager cannot be entered multiple times")
111111-9494+ raise RuntimeError(
9595+ "MLXRunner context manager cannot be entered multiple times"
9696+ )
9797+11298 self._context_entered = True
11399 try:
114100 self.load_model()
···146132 try:
147133 # Load model and tokenizer
148134 self.model, self.tokenizer = load(
149149- str(self.model_path),
150150- adapter_path=self.adapter_path
135135+ str(self.model_path), adapter_path=self.adapter_path
151136 )
152137153138 load_time = time.time() - start_time
···156141157142 if self.verbose:
158143 print(f"Model loaded in {load_time:.1f}s")
159159- print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total")
144144+ print(
145145+ f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total"
146146+ )
160147161148 # Extract stop tokens from tokenizer
162149 self._extract_stop_tokens()
163163-150150+164151 # Extract context length from model config
165152 self._context_length = get_model_context_length(str(self.model_path))
166166-153153+167154 if self.verbose:
168155 print(f"Model context length: {self._context_length} tokens")
169169-156156+170157 self._model_loaded = True
171171-158158+172159 except Exception as e:
173160 # Ensure partial state is cleaned up on failure
174161 self.model = None
···177164 self._model_loaded = False
178165 # Clear any memory that might have been allocated
179166 mx.clear_cache()
180180- raise RuntimeError(f"Failed to load model from {self.model_path}: {e}") from e
167167+ raise RuntimeError(
168168+ f"Failed to load model from {self.model_path}: {e}"
169169+ ) from e
181170182171 def _extract_stop_tokens(self):
183172 """Extract stop tokens from the tokenizer dynamically.
184184-173173+185174 This method identifies ALL tokens that should stop generation:
186175 1. Official EOS token from tokenizer config
187176 2. Message-end tokens from training (e.g., <|end|> for MXFP4)
188177 3. Common stop tokens across models
189178 """
190179 self._stop_tokens = set()
191191- self._message_end_tokens = set() # Tokens that end messages but not conversations
180180+ self._message_end_tokens = (
181181+ set()
182182+ ) # Tokens that end messages but not conversations
192183193184 # Primary source: eos_token
194194- eos_token = getattr(self.tokenizer, 'eos_token', None)
185185+ eos_token = getattr(self.tokenizer, "eos_token", None)
195186 if eos_token:
196187 self._stop_tokens.add(eos_token)
197188198189 # Also check pad_token if it's different from eos_token
199199- pad_token = getattr(self.tokenizer, 'pad_token', None)
190190+ pad_token = getattr(self.tokenizer, "pad_token", None)
200191 if pad_token and pad_token != eos_token:
201192 self._stop_tokens.add(pad_token)
202193203194 # Check additional_special_tokens
204204- if hasattr(self.tokenizer, 'additional_special_tokens'):
195195+ if hasattr(self.tokenizer, "additional_special_tokens"):
205196 for token in self.tokenizer.additional_special_tokens:
206197 if token and isinstance(token, str):
207198 # Only add tokens that look like stop/end tokens
208208- if any(keyword in token.lower() for keyword in ['end', 'stop', 'eot']):
199199+ if any(
200200+ keyword in token.lower() for keyword in ["end", "stop", "eot"]
201201+ ):
209202 self._stop_tokens.add(token)
210210-203203+211204 # MLX-LM 0.27.0+: Extract tokens from added_tokens_decoder (comprehensive source)
212212- if hasattr(self.tokenizer, 'added_tokens_decoder'):
205205+ if hasattr(self.tokenizer, "added_tokens_decoder"):
213206 for _token_id, token_info in self.tokenizer.added_tokens_decoder.items():
214214- if isinstance(token_info, dict) and 'content' in token_info:
215215- token_content = token_info['content']
207207+ if isinstance(token_info, dict) and "content" in token_info:
208208+ token_content = token_info["content"]
216209 if token_content and isinstance(token_content, str):
217210 token_lower = token_content.lower()
218218-211211+219212 # NOTE: <|end|> is NOT a stop token for MXFP4 models!
220213 # It's a separator between reasoning and final answer
221221- if token_content == '<|end|>':
214214+ if token_content == "<|end|>":
222215 self._message_end_tokens.add(token_content)
223216 # Do NOT add as stop token - let model continue to final answer
224224-217217+225218 # Look for tokens that could be end/stop tokens
226219 # Expanded patterns for MLX-LM 0.27.0 token varieties
227220 # EXCLUDE <|end|> for MXFP4 models as it's a reasoning separator
228228- end_patterns = ['stop', 'eot', 'return', 'finish', 'done', 'im_end']
221221+ end_patterns = [
222222+ "stop",
223223+ "eot",
224224+ "return",
225225+ "finish",
226226+ "done",
227227+ "im_end",
228228+ ]
229229 if any(pattern in token_lower for pattern in end_patterns):
230230 # Decide if it's a message-end or conversation-end token
231231- if 'im_end' in token_lower:
231231+ if "im_end" in token_lower:
232232 self._message_end_tokens.add(token_content)
233233 self._stop_tokens.add(token_content)
234234 # Special handling for 'end' pattern - more selective
235235- elif 'end' in token_lower and token_content != '<|end|>':
235235+ elif "end" in token_lower and token_content != "<|end|>":
236236 # Only add non-<|end|> tokens with 'end' in them
237237 self._stop_tokens.add(token_content)
238238-238238+239239 # Special case: control tokens in |..| format
240240- elif token_content.startswith('<|') and token_content.endswith('|>'):
240240+ elif token_content.startswith("<|") and token_content.endswith(
241241+ "|>"
242242+ ):
241243 # Be inclusive with control tokens that might stop generation
242242- if any(pattern in token_lower for pattern in ['end', 'return', 'stop', 'finish']):
244244+ if any(
245245+ pattern in token_lower
246246+ for pattern in ["end", "return", "stop", "finish"]
247247+ ):
243248 self._stop_tokens.add(token_content)
244249245250 # Model-specific handling based on known patterns
246251 # Use reasoning_utils for reasoning model detection and patterns
247247- from .reasoning_utils import ReasoningExtractor
248248-249249- if hasattr(self.tokenizer, 'name_or_path'):
250250- name_or_path = str(getattr(self.tokenizer, 'name_or_path', '')).lower()
252252+ from ..reasoning_utils import ReasoningExtractor
253253+254254+ if hasattr(self.tokenizer, "name_or_path"):
255255+ name_or_path = str(getattr(self.tokenizer, "name_or_path", "")).lower()
251256 model_type = ReasoningExtractor.detect_model_type(name_or_path)
252252-257257+253258 if model_type:
254259 # This is a reasoning model
255260 self._is_reasoning_model = True
256256-261261+257262 # Get patterns from reasoning_utils
258263 if model_type in ReasoningExtractor.PATTERNS:
259259- markers = ReasoningExtractor.PATTERNS[model_type]['markers']
260260- self._reasoning_start = markers.get('reasoning_start')
261261- self._reasoning_end = markers.get('reasoning_end')
262262- self._final_start = markers.get('final_marker')
263263-264264+ markers = ReasoningExtractor.PATTERNS[model_type]["markers"]
265265+ self._reasoning_start = markers.get("reasoning_start")
266266+ self._reasoning_end = markers.get("reasoning_end")
267267+ self._final_start = markers.get("final_marker")
268268+264269 # For reasoning models, remove reasoning_end from stop tokens
265270 if self._reasoning_end:
266271 self._stop_tokens.discard(self._reasoning_end)
267267-272272+268273 # Add proper stop token for this model type
269269- if model_type == 'gpt-oss':
270270- if '<|return|>' not in self._stop_tokens:
271271- self._stop_tokens.add('<|return|>')
274274+ if model_type == "gpt-oss":
275275+ if "<|return|>" not in self._stop_tokens:
276276+ self._stop_tokens.add("<|return|>")
272277 else:
273278 self._is_reasoning_model = False
274279 else:
275280 self._is_reasoning_model = False
276281277282 # Add common stop tokens that might not be in special tokens
278278- common_stop_tokens = {'</s>', '<|endoftext|>', '<|im_end|>', '<|eot_id|>'}
279279-283283+ common_stop_tokens = {"</s>", "<|endoftext|>", "<|im_end|>", "<|eot_id|>"}
284284+280285 # Add chat-specific stop tokens to prevent model self-conversations
281286 # Based on our _format_conversation() format: "Human:" and "Assistant:"
282287 # Also include "You:" as models might use UI-visible format
283288 # Include single-letter variations (H:, A:, Y:) that some models use
284289 chat_stop_tokens = {
285285- '\nHuman:', '\nAssistant:', '\nYou:',
286286- '\n\nHuman:', '\n\nAssistant:', '\n\nYou:',
287287- '\nH:', '\nA:', '\nY:', # Single-letter variations
288288- '\n\nH:', '\n\nA:', '\n\nY:'
290290+ "\nHuman:",
291291+ "\nAssistant:",
292292+ "\nYou:",
293293+ "\n\nHuman:",
294294+ "\n\nAssistant:",
295295+ "\n\nYou:",
296296+ "\nH:",
297297+ "\nA:",
298298+ "\nY:", # Single-letter variations
299299+ "\n\nH:",
300300+ "\n\nA:",
301301+ "\n\nY:",
289302 }
290303291304 # Add common stop tokens only if they decode to themselves (i.e., they're single tokens)
···299312 self._stop_tokens.add(token)
300313 except:
301314 pass
302302-315315+303316 # Store chat stop tokens separately - only used in interactive chat mode
304317 # This prevents stopping mid-story when user asks for dialogues
305318 self._chat_stop_tokens = list(chat_stop_tokens)
···320333321334 def cleanup(self):
322335 """Clean up model resources and clear GPU memory.
323323-336336+324337 This method is safe to call multiple times and handles partial state cleanup.
325338 """
326339 if self.verbose and self._model_loaded:
···342355343356 # Force garbage collection and clear MLX cache
344357 import gc
358358+345359 gc.collect()
346360 try:
347361 mx.clear_cache()
···350364351365 if self.verbose:
352366 memory_after = mx.get_active_memory() / 1024**3
353353- if 'memory_before' in locals():
367367+ if "memory_before" in locals():
354368 memory_freed = memory_before - memory_after
355355- print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)")
369369+ print(
370370+ f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)"
371371+ )
356372 else:
357373 print(f"Cleanup complete (memory after: {memory_after:.1f}GB)")
358374359359- def get_effective_max_tokens(self, requested_tokens: Optional[int], interactive: bool = False) -> int:
375375+ def get_effective_max_tokens(
376376+ self, requested_tokens: Optional[int], interactive: bool = False
377377+ ) -> int:
360378 """Get effective max tokens based on model context and usage mode.
361361-379379+362380 Args:
363381 requested_tokens: The requested max tokens (None if user didn't specify --max-tokens)
364382 interactive: True if this is interactive mode (gets full context length)
365365-383383+366384 Returns:
367385 Effective max tokens to use
368386 """
···371389 fallback = 4096 if interactive else 2048
372390 if self.verbose:
373391 if requested_tokens is None:
374374- print(f"[WARNING] Model context length unknown, using fallback: {fallback} tokens")
392392+ print(
393393+ f"[WARNING] Model context length unknown, using fallback: {fallback} tokens"
394394+ )
375395 else:
376376- print(f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens")
396396+ print(
397397+ f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens"
398398+ )
377399 return requested_tokens if requested_tokens is not None else fallback
378378-400400+379401 if interactive:
380402 if requested_tokens is None:
381403 # User didn't specify --max-tokens: use full model context
···402424 hide_reasoning: bool = False,
403425 ) -> Iterator[str]:
404426 """Generate text with streaming output.
405405-427427+406428 Args:
407429 prompt: Input prompt
408430 max_tokens: Maximum tokens to generate
···413435 use_chat_template: Apply tokenizer's chat template if available
414436 use_chat_stop_tokens: Include chat turn markers as stop tokens (for interactive mode)
415437 interactive: True if this is interactive mode (affects token limits)
416416-438438+417439 Yields:
418440 Generated tokens as they are produced
419441 """
420442 if not self.model or not self.tokenizer:
421443 raise RuntimeError("Model not loaded. Call load_model() first.")
422422-444444+423445 # Initialize reasoning parser if this is a reasoning model
424446 reasoning_parser = None
425447 if self._is_reasoning_model:
426448 model_type = ReasoningExtractor.detect_model_type(
427427- getattr(self.tokenizer, 'name_or_path', '') or ''
449449+ getattr(self.tokenizer, "name_or_path", "") or ""
450450+ )
451451+ reasoning_parser = StreamingReasoningParser(
452452+ model_type, hide_reasoning=hide_reasoning
428453 )
429429- reasoning_parser = StreamingReasoningParser(model_type, hide_reasoning=hide_reasoning)
430454431455 # Apply context-aware token limits
432456 effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive)
433457434458 # Apply chat template if available and requested
435435- if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
459459+ if (
460460+ use_chat_template
461461+ and hasattr(self.tokenizer, "chat_template")
462462+ and self.tokenizer.chat_template
463463+ ):
436464 messages = [{"role": "user", "content": prompt}]
437465 formatted_prompt = self.tokenizer.apply_chat_template(
438438- messages,
439439- tokenize=False,
440440- add_generation_prompt=True
466466+ messages, tokenize=False, add_generation_prompt=True
441467 )
442468 else:
443469 formatted_prompt = prompt
···479505480506 for token, _ in generator:
481507 # Token might be an array or an int
482482- token_id = token.item() if hasattr(token, 'item') else token
508508+ token_id = token.item() if hasattr(token, "item") else token
483509 generated_tokens.append(token_id)
484510485511 # Use a sliding window approach for efficiency
···493519 if start_idx == 0:
494520 # We're still within the context window
495521 if window_text.startswith(previous_decoded):
496496- new_text = window_text[len(previous_decoded):]
522522+ new_text = window_text[len(previous_decoded) :]
497523 else:
498524 new_text = self.tokenizer.decode([token_id])
499525 previous_decoded = window_text
···504530 if len(window_tokens) > 1:
505531 prefix = self.tokenizer.decode(window_tokens[:-1])
506532 if new_text.startswith(prefix):
507507- new_text = new_text[len(prefix):]
533533+ new_text = new_text[len(prefix) :]
508534 else:
509535 new_text = self.tokenizer.decode([token_id])
510536511537 if new_text:
512538 # Update accumulated response for stop token checking
513539 accumulated_response += new_text
514514-540540+515541 # Filter out stop tokens with priority: native first, then chat fallback
516542 # Check native stop tokens FIRST in accumulated response (highest priority)
517543 native_stop_tokens = self._stop_tokens if self._stop_tokens else []
···522548 # Calculate what text came before the stop token
523549 text_before_stop = accumulated_response[:stop_pos]
524550 # Calculate how much of that is new (not previously yielded)
525525- previously_yielded_length = len(accumulated_response) - len(new_text)
551551+ previously_yielded_length = len(accumulated_response) - len(
552552+ new_text
553553+ )
526554 if len(text_before_stop) > previously_yielded_length:
527555 # Yield only the new part before stop token
528528- new_part_before_stop = text_before_stop[previously_yielded_length:]
556556+ new_part_before_stop = text_before_stop[
557557+ previously_yielded_length:
558558+ ]
529559 if new_part_before_stop:
530560 if reasoning_parser:
531561 # Process through reasoning parser for formatting
532532- for formatted_token in reasoning_parser.process_token(new_part_before_stop):
562562+ for (
563563+ formatted_token
564564+ ) in reasoning_parser.process_token(
565565+ new_part_before_stop
566566+ ):
533567 yield formatted_token
534568 else:
535569 yield new_part_before_stop
536570 return # Stop generation without yielding stop token
537537-571571+538572 # Only check chat stop tokens if no native stop token found (fallback)
539573 if use_chat_stop_tokens and self._chat_stop_tokens:
540574 for stop_token in self._chat_stop_tokens:
···544578 # Calculate what text came before the stop token
545579 text_before_stop = accumulated_response[:stop_pos]
546580 # Calculate how much of that is new (not previously yielded)
547547- previously_yielded_length = len(accumulated_response) - len(new_text)
581581+ previously_yielded_length = len(accumulated_response) - len(
582582+ new_text
583583+ )
548584 if len(text_before_stop) > previously_yielded_length:
549585 # Yield only the new part before stop token
550550- new_part_before_stop = text_before_stop[previously_yielded_length:]
586586+ new_part_before_stop = text_before_stop[
587587+ previously_yielded_length:
588588+ ]
551589 if new_part_before_stop:
552590 if reasoning_parser:
553591 # Process through reasoning parser for formatting
554554- for formatted_token in reasoning_parser.process_token(new_part_before_stop):
592592+ for (
593593+ formatted_token
594594+ ) in reasoning_parser.process_token(
595595+ new_part_before_stop
596596+ ):
555597 yield formatted_token
556598 else:
557599 yield new_part_before_stop
···574616 # Finalize reasoning parser if used
575617 if reasoning_parser:
576618 yield from reasoning_parser.finalize()
577577-619619+578620 # Print generation statistics if verbose
579621 if self.verbose:
580622 generation_time = time.time() - start_time
581581- tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0
582582- print(f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)")
623623+ tokens_per_second = (
624624+ tokens_generated / generation_time if generation_time > 0 else 0
625625+ )
626626+ print(
627627+ f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)"
628628+ )
583629584630 def generate_batch(
585631 self,
···593639 interactive: bool = False,
594640 ) -> str:
595641 """Generate text in batch mode (non-streaming).
596596-642642+597643 Args:
598644 prompt: Input prompt
599645 max_tokens: Maximum tokens to generate
···603649 repetition_context_size: Context size for repetition penalty
604650 use_chat_template: Apply tokenizer's chat template if available
605651 interactive: True if this is interactive mode (affects token limits)
606606-652652+607653 Returns:
608654 Generated text
609655 """
···614660 effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive)
615661616662 # Apply chat template if available and requested
617617- if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
663663+ if (
664664+ use_chat_template
665665+ and hasattr(self.tokenizer, "chat_template")
666666+ and self.tokenizer.chat_template
667667+ ):
618668 messages = [{"role": "user", "content": prompt}]
619669 formatted_prompt = self.tokenizer.apply_chat_template(
620620- messages,
621621- tokenize=False,
622622- add_generation_prompt=True
670670+ messages, tokenize=False, add_generation_prompt=True
623671 )
624672 else:
625673 formatted_prompt = prompt
···654702655703 for token, _ in generator:
656704 # Token might be an array or an int
657657- token_id = token.item() if hasattr(token, 'item') else token
705705+ token_id = token.item() if hasattr(token, "item") else token
658706 generated_tokens.append(token_id)
659707 all_tokens.append(token_id)
660708···667715668716 # Remove the prompt part
669717 if full_response.startswith(formatted_prompt):
670670- response = full_response[len(formatted_prompt):]
718718+ response = full_response[len(formatted_prompt) :]
671719 else:
672720 # Fallback: just decode generated tokens
673721 response = self.tokenizer.decode(generated_tokens)
674722675723 # Apply end-token filtering (same logic as streaming mode for Issue #20)
676676- response = self._filter_end_tokens_from_response(response, use_chat_stop_tokens=False)
677677-724724+ response = self._filter_end_tokens_from_response(
725725+ response, use_chat_stop_tokens=False
726726+ )
727727+678728 # Format reasoning models output
679729 response = self._format_reasoning_response(response)
680730···683733 # Count tokens for statistics
684734 if self.verbose:
685735 tokens_generated = len(generated_tokens)
686686- tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0
687687- print(f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)")
736736+ tokens_per_second = (
737737+ tokens_generated / generation_time if generation_time > 0 else 0
738738+ )
739739+ print(
740740+ f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)"
741741+ )
688742689743 return response
690744···698752 use_chat_template: bool = True,
699753 ):
700754 """Run an interactive chat session.
701701-755755+702756 Args:
703757 system_prompt: Optional system prompt to prepend
704758 max_tokens: Maximum tokens per response
···718772 # Get user input
719773 user_input = input("You: ").strip()
720774721721- if user_input.lower() in ['exit', 'quit', 'q']:
775775+ if user_input.lower() in ["exit", "quit", "q"]:
722776 print("\nGoodbye!")
723777 break
724778···729783 conversation_history.append({"role": "user", "content": user_input})
730784731785 # Format conversation for the model using chat template if available
732732- prompt = self._format_conversation(conversation_history, use_chat_template=use_chat_template)
786786+ prompt = self._format_conversation(
787787+ conversation_history, use_chat_template=use_chat_template
788788+ )
733789734790 # Generate response with streaming
735791 print("\nAssistant: ", end="", flush=True)
···751807752808 # Add assistant response to history
753809 assistant_response = "".join(response_tokens).strip()
754754- conversation_history.append({"role": "assistant", "content": assistant_response})
810810+ conversation_history.append(
811811+ {"role": "assistant", "content": assistant_response}
812812+ )
755813756814 print() # New line after response
757815···762820 print(f"\n[ERROR] {e}")
763821 continue
764822765765- def _format_conversation(self, messages: list, use_chat_template: bool = True) -> str:
823823+ def _format_conversation(
824824+ self, messages: list, use_chat_template: bool = True
825825+ ) -> str:
766826 """Format conversation history into a prompt.
767767-827827+768828 Uses the tokenizer's chat template if available, otherwise falls back
769829 to the legacy Human:/Assistant: format for compatibility.
770770-830830+771831 Args:
772832 messages: List of message dictionaries with 'role' and 'content'
773833 use_chat_template: Whether to use chat template if available
774774-834834+775835 Returns:
776836 Formatted conversation string
777837 """
778838 # Try to use native chat template if available
779779- if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
839839+ if (
840840+ use_chat_template
841841+ and hasattr(self.tokenizer, "chat_template")
842842+ and self.tokenizer.chat_template
843843+ ):
780844 try:
781845 # Apply the tokenizer's chat template
782846 formatted_prompt = self.tokenizer.apply_chat_template(
783783- messages,
784784- tokenize=False,
785785- add_generation_prompt=True
847847+ messages, tokenize=False, add_generation_prompt=True
786848 )
787849 return formatted_prompt
788850 except Exception as e:
789851 # If chat template fails, fall back to legacy format
790852 if self.verbose:
791853 print(f"[WARNING] Chat template failed, using legacy format: {e}")
792792-854854+793855 # Legacy format fallback for compatibility
794856 return self._legacy_format_conversation(messages)
795795-857857+796858 def _legacy_format_conversation(self, messages: list) -> str:
797859 """Legacy conversation formatting for backward compatibility.
798798-860860+799861 This format was used in earlier versions and remains as a fallback
800862 for models without chat templates.
801863 """
···819881820882 def get_memory_usage(self) -> Dict[str, float]:
821883 """Get current memory usage statistics.
822822-884884+823885 Returns:
824886 Dictionary with memory statistics in GB
825887 """
···834896 return {
835897 "current_gb": current_memory,
836898 "peak_gb": peak_memory,
837837- "model_gb": current_memory - self._memory_baseline if self._memory_baseline else 0,
899899+ "model_gb": (
900900+ current_memory - self._memory_baseline if self._memory_baseline else 0
901901+ ),
838902 }
839903840904 def _format_reasoning_response(self, response: str) -> str:
841905 """Format response from reasoning models for better readability.
842842-906906+843907 For MXFP4 models that generate reasoning followed by final answer,
844908 format it nicely for display.
845909 """
846910 if not self._is_reasoning_model:
847911 return response
848848-912912+849913 # Check if response contains reasoning markers
850914 if self._reasoning_start in response and self._final_start in response:
851915 # Extract reasoning and final parts
852916 try:
853917 # Split on the reasoning start
854918 before_reasoning, after_start = response.split(self._reasoning_start, 1)
855855-919919+856920 # Find the reasoning content (until <|end|>)
857921 if self._reasoning_end in after_start:
858858- reasoning_content, after_reasoning = after_start.split(self._reasoning_end, 1)
859859-922922+ reasoning_content, after_reasoning = after_start.split(
923923+ self._reasoning_end, 1
924924+ )
925925+860926 # Find the final answer
861927 if self._final_start in after_reasoning:
862928 # Extract everything after final marker
863929 final_parts = after_reasoning.split(self._final_start, 1)
864930 if len(final_parts) > 1:
865931 # Remove the <|channel|>final<|message|> marker
866866- final_answer = final_parts[1].replace('<|channel|>final<|message|>', '', 1)
867867-932932+ final_answer = final_parts[1].replace(
933933+ "<|channel|>final<|message|>", "", 1
934934+ )
935935+868936 # Format with clear markers for parsing but minimal visual impact
869937 formatted = []
870938 formatted.append("\n**[Reasoning]**\n")
871939 formatted.append(reasoning_content.strip())
872940 formatted.append("\n\n---\n\n**[Answer]**\n")
873941 formatted.append(final_answer.strip())
874874-875875- return '\n'.join(formatted)
942942+943943+ return "\n".join(formatted)
876944 except Exception:
877945 # If parsing fails, return original
878946 pass
879879-947947+880948 # Fallback: just clean up the control tokens
881949 cleaned = response
882882- for marker in ['<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant',
883883- '<|channel|>final<|message|>', '<|return|>']:
884884- cleaned = cleaned.replace(marker, '')
885885-950950+ for marker in [
951951+ "<|channel|>analysis<|message|>",
952952+ "<|end|>",
953953+ "<|start|>assistant",
954954+ "<|channel|>final<|message|>",
955955+ "<|return|>",
956956+ ]:
957957+ cleaned = cleaned.replace(marker, "")
958958+886959 return cleaned.strip()
887887-888888- def _filter_end_tokens_from_response(self, response: str, use_chat_stop_tokens: bool = False) -> str:
960960+961961+ def _filter_end_tokens_from_response(
962962+ self, response: str, use_chat_stop_tokens: bool = False
963963+ ) -> str:
889964 """Filter end tokens from a complete response (batch mode).
890890-965965+891966 This method applies the same filtering logic as the streaming mode
892967 to ensure consistent behavior between streaming and non-streaming.
893893-968968+894969 Args:
895970 response: The complete generated response
896971 use_chat_stop_tokens: Whether to apply chat stop tokens
897897-972972+898973 Returns:
899974 Response with end tokens filtered out
900975 """
···906981 stop_pos = response.find(stop_token)
907982 filtered_response = response[:stop_pos].rstrip()
908983 if self.verbose:
909909- print(f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}")
984984+ print(
985985+ f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}"
986986+ )
910987 return filtered_response
911911-988988+912989 # Only check chat stop tokens if no native stop token found (fallback)
913990 if use_chat_stop_tokens and self._chat_stop_tokens:
914991 for stop_token in self._chat_stop_tokens:
···916993 # Find the stop token position and return everything before it
917994 stop_pos = response.find(stop_token)
918995 return response[:stop_pos]
919919-996996+920997 # No stop tokens found, return original response
921998 return response
92299992310009241001def get_gpu_status() -> Dict[str, float]:
9251002 """Independent GPU status check - usable from anywhere.
926926-10031003+9271004 Returns:
9281005 Dictionary with GPU memory statistics in GB
9291006 """
···93510129361013def check_memory_available(required_gb: float) -> bool:
9371014 """Pre-flight check before model loading.
938938-10151015+9391016 Args:
9401017 required_gb: Required memory in GB
941941-10181018+9421019 Returns:
9431020 True if memory is likely available (conservative estimate)
9441021 """
···9661043 verbose: bool = False,
9671044) -> Optional[str]:
9681045 """Enhanced run function with direct MLX integration.
969969-10461046+9701047 Uses context manager pattern for automatic resource cleanup.
971971-10481048+9721049 Args:
9731050 model_path: Path to the MLX model
9741051 prompt: Input prompt (if None, enters interactive mode)
···9781055 top_p: Top-p sampling parameter
9791056 repetition_penalty: Penalty for repeated tokens
9801057 stream: Whether to stream output
981981-10581058+9821059 Returns:
9831060 Generated text (in non-interactive mode)
9841061 """
···10381115 # Show memory usage if verbose
10391116 if verbose:
10401117 memory_stats = runner.get_memory_usage()
10411041- print(f"\n\nMemory: {memory_stats['model_gb']:.1f}GB model, {memory_stats['current_gb']:.1f}GB total")
11181118+ print(
11191119+ f"\n\nMemory: {memory_stats['model_gb']:.1f}GB model, {memory_stats['current_gb']:.1f}GB total"
11201120+ )
1042112110431122 return response
10441123···10471126 except Exception as e:
10481127 print(f"\n[ERROR] {e}")
10491128 return None
10501050-
+27-43
server/model_card.py
···11-# MIT License
22-33-# Copyright (c) 2025 The BROKE team 🦫
44-55-# Permission is hereby granted, free of charge, to any person obtaining a copy
66-# of this software and associated documentation files (the "Software"), to deal
77-# in the Software without restriction, including without limitation the rights
88-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
99-# copies of the Software, and to permit persons to whom the Software is
1010-# furnished to do so, subject to the following conditions:
1111-1212-# The above copyright notice and this permission notice shall be included in all
1313-# copies or substantial portions of the Software.
1414-1515-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1616-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1717-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121-# SOFTWARE.
221from __future__ import annotations
232243# ruff: noqa: UP045
···4524def _latest_snapshot_dir(model_base_dir: Path) -> Optional[Path]:
4625 """Return latest snapshot directory for a cached HF model base dir."""
4726 try:
4848- snaps = (model_base_dir / "snapshots")
2727+ snaps = model_base_dir / "snapshots"
4928 if not snaps.exists():
5029 return None
5130 candidates = [d for d in snaps.iterdir() if d.is_dir()]
···7352 """
7453 start = text.find("\n---\n")
7554 # Accept files starting directly with '---' too
7676- if text.startswith('---'):
5555+ if text.startswith("---"):
7756 start = 0
7857 elif start >= 0:
7958 start = start + 1 # move to line start
8059 else:
8160 # Try at very beginning without newline
8282- start = 0 if text[:3] == '---' else -1
6161+ start = 0 if text[:3] == "---" else -1
8362 if start != 0:
8463 return {}
85648665 # Find closing '---' after start
8787- end = text.find('\n---', 3)
6666+ end = text.find("\n---", 3)
8867 if end == -1:
8968 return {}
9090- header = text[3:end] if text.startswith('---') else text[start + 3:end]
6969+ header = text[3:end] if text.startswith("---") else text[start + 3 : end]
91709271 # Normalize lines
9372 lines = [ln.strip() for ln in header.splitlines() if ln.strip()]
···10382 list_acc = []
1048310584 for ln in lines:
106106- if ln.startswith('- '):
8585+ if ln.startswith("- "):
10786 # list item under current_key
108108- val = ln[2:].strip().strip('"\'')
8787+ val = ln[2:].strip().strip("\"'")
10988 if current_key is not None:
11089 list_acc.append(val)
11190 continue
11291 # key: value or key: [a, b]
113113- if ':' in ln:
9292+ if ":" in ln:
11493 # Close any previous list
11594 flush_list()
116116- key, val = ln.split(':', 1)
9595+ key, val = ln.split(":", 1)
11796 key = key.strip()
11897 val = val.strip()
11998 current_key = key
···122101 data.setdefault(key, [])
123102 continue
124103 # Inline list [a, b]
125125- if val.startswith('[') and val.endswith(']'):
104104+ if val.startswith("[") and val.endswith("]"):
126105 inner = val[1:-1].strip()
127127- items = [] if not inner else [it.strip().strip('"\'') for it in inner.split(',')]
106106+ items = (
107107+ []
108108+ if not inner
109109+ else [it.strip().strip("\"'") for it in inner.split(",")]
110110+ )
128111 data[key] = [x for x in items if x]
129112 continue
130113 # Scalar
131131- data[key] = val.strip('"\'')
114114+ data[key] = val.strip("\"'")
132115 continue
133116 # Non key-value, ignore
134117 # Flush last list
···136119 return data
137120138121139139-def read_readme_front_matter(model_base_dir: Path) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]:
122122+def read_readme_front_matter(
123123+ model_base_dir: Path,
124124+) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]:
140125 """Read README.md front matter and extract tags, pipeline_tag, library_name.
141126142127 Returns (tags, pipeline_tag, library_name) with lowercase normalization where applicable.
···146131 snap = _latest_snapshot_dir(model_base_dir)
147132 if not snap:
148133 return None, None, None
149149- readme = snap / 'README.md'
134134+ readme = snap / "README.md"
150135 if not readme.exists():
151136 return None, None, None
152152- text = readme.read_text(encoding='utf-8', errors='ignore')
137137+ text = readme.read_text(encoding="utf-8", errors="ignore")
153138 fm = _lenient_yaml_front_matter(text)
154139 if not fm:
155140 return None, None, None
156156- tags = fm.get('tags')
141141+ tags = fm.get("tags")
157142 if isinstance(tags, list):
158143 tags = [str(t).strip().lower() for t in tags if str(t).strip()]
159144 else:
160145 tags = None
161161- pipeline = fm.get('pipeline_tag')
146146+ pipeline = fm.get("pipeline_tag")
162147 pipeline = str(pipeline).strip().lower() if pipeline else None
163163- lib = fm.get('library_name')
148148+ lib = fm.get("library_name")
164149 lib = str(lib).strip().lower() if lib else None
165150 return tags, pipeline, lib
166151 except Exception:
···173158 snap = _latest_snapshot_dir(model_base_dir)
174159 if not snap:
175160 return False
176176- tk = snap / 'tokenizer_config.json'
161161+ tk = snap / "tokenizer_config.json"
177162 if not tk.exists():
178163 return False
179179- with open(tk, encoding='utf-8') as f:
164164+ with open(tk, encoding="utf-8") as f:
180165 data = json.load(f)
181181- tmpl = data.get('chat_template')
166166+ tmpl = data.get("chat_template")
182167 return bool(tmpl and isinstance(tmpl, str) and tmpl.strip())
183168 except Exception:
184169 return False
185185-
···11-# MIT License
22-33-# Copyright (c) 2025 The BROKE team 🦫
44-55-# Permission is hereby granted, free of charge, to any person obtaining a copy
66-# of this software and associated documentation files (the "Software"), to deal
77-# in the Software without restriction, including without limitation the rights
88-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
99-# copies of the Software, and to permit persons to whom the Software is
1010-# furnished to do so, subject to the following conditions:
1111-1212-# The above copyright notice and this permission notice shall be included in all
1313-# copies or substantial portions of the Software.
1414-1515-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1616-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1717-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1818-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1919-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2020-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121-# SOFTWARE.
221"""
232Utilities for handling reasoning models and their output.
class ReasoningExtractor:
    """Extract reasoning and final answer from model outputs.

    Each entry in ``PATTERNS`` describes one family of reasoning models:
    a regex for the reasoning section, a regex for the final answer, and a
    ``markers`` mapping of the literal control tokens used by that family.
    """

    # Model-specific patterns
    PATTERNS = {
        "gpt-oss": {
            "reasoning": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>",
            "final": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)",
            "markers": {
                "reasoning_start": "<|channel|>analysis<|message|>",
                "reasoning_end": "<|end|>",
                "final_marker": "<|channel|>final<|message|>",
                # Skip tokens that appear between reasoning and final
                "skip_tokens": [
                    "<|start|>assistant<|channel|>final<|message|>",
                    "<|start|>assistant",
                    "<|start|>",
                    "<|channel|>final<|message|>",
                ],
                # Conditional skip tokens - only skip if at start of final section
                "conditional_skip": ["assistant"],
            },
        },
        "deepseek": {
            "reasoning": r"<think>(.*?)</think>",
            "final": r"</think>(.*?)$",
            "markers": {
                "reasoning_start": "<think>",
                "reasoning_end": "</think>",
            },
        },
        "claude": {
            "reasoning": r"<thinking>(.*?)</thinking>",
            "final": r"</thinking>(.*?)$",
            "markers": {
                "reasoning_start": "<thinking>",
                "reasoning_end": "</thinking>",
            },
        },
    }

    @classmethod
    def detect_model_type(cls, model_name: str) -> Optional[str]:
        """Detect reasoning model type from model name.

        Args:
            model_name: Model name or identifier; matched case-insensitively.

        Returns:
            A key of ``PATTERNS``, or None when the name matches no known
            reasoning model family.
        """
        model_lower = model_name.lower()

        if "gpt-oss" in model_lower:
            return "gpt-oss"
        elif "deepseek" in model_lower and "r1" in model_lower:
            return "deepseek"
        elif "claude" in model_lower:
            return "claude"
        elif "qwq" in model_lower:
            return "gpt-oss"  # QwQ uses similar format to GPT-OSS

        return None

    @classmethod
    def _strip_markers(cls, text: str, markers: dict) -> str:
        """Remove every literal control marker of a model family from text.

        ``markers`` mixes plain strings (start/end/final markers) with lists
        (``skip_tokens``, ``conditional_skip``). Passing a list to
        ``str.replace`` raises TypeError, so only the string values and the
        entries of ``skip_tokens`` are removed here. ``conditional_skip``
        entries are deliberately left alone: they are ordinary words (e.g.
        "assistant") that may only be dropped at the start of the final
        section, and removing them everywhere would corrupt answers.
        """
        tokens = [value for value in markers.values() if isinstance(value, str)]
        tokens.extend(markers.get("skip_tokens", []))
        for tok in tokens:
            text = text.replace(tok, "")
        return text

    @classmethod
    def extract(
        cls,
        text: str,
        model_type: Optional[str] = None,
        model_name: Optional[str] = None,
    ) -> Dict[str, Optional[str]]:
        """
        Extract reasoning and final answer from model output.

        Args:
            text: The full model output
            model_type: Explicit model type; a key of ``PATTERNS``
                ('gpt-oss', 'deepseek', 'claude')
            model_name: Model name to auto-detect type

        Returns:
            Dictionary with 'reasoning', 'final_answer', 'full_response',
            'has_reasoning' (bool) and 'model_type'. When the model type is
            missing or unknown the text is returned unchanged as
            'final_answer'.
        """
        # Auto-detect model type if not provided
        if not model_type and model_name:
            model_type = cls.detect_model_type(model_name)

        # If no model type detected, return text as-is
        if not model_type or model_type not in cls.PATTERNS:
            return {
                "reasoning": None,
                "final_answer": text,
                "full_response": text,
                "has_reasoning": False,
                # Same key set as the regular return below.
                "model_type": model_type,
            }

        patterns = cls.PATTERNS[model_type]
        markers = patterns.get("markers", {})

        # Extract reasoning
        reasoning_match = re.search(patterns["reasoning"], text, re.DOTALL)
        reasoning = reasoning_match.group(1).strip() if reasoning_match else None

        # Extract final answer
        final_match = re.search(patterns["final"], text, re.DOTALL)
        final_answer = final_match.group(1).strip() if final_match else None

        # If no final answer found but we have reasoning,
        # the text after reasoning might be the answer
        if reasoning and not final_answer and "reasoning_end" in markers:
            split_text = text.split(markers["reasoning_end"], 1)
            if len(split_text) > 1:
                # Clean up any remaining markers
                final_answer = cls._strip_markers(split_text[1], markers).strip()

        # If still no final answer, use full text minus all known markers
        if not final_answer:
            final_answer = cls._strip_markers(text, markers).strip()

        return {
            "reasoning": reasoning,
            "final_answer": final_answer,
            "full_response": text,
            "has_reasoning": bool(reasoning),
            "model_type": model_type,
        }

    @classmethod
    def format_for_display(
        cls, extracted: Dict[str, Optional[str]], show_reasoning: bool = False
    ) -> str:
        """
        Format extracted content for display.

        Args:
            extracted: Output from extract()
            show_reasoning: Whether to include reasoning in output

        Returns:
            Formatted string for display; an empty string when there is no
            final answer.
        """
        # ``or ""`` also covers a key that is present but set to None.
        final_answer = extracted.get("final_answer") or ""

        if not extracted.get("has_reasoning") or not show_reasoning:
            return final_answer

        output = []
        if extracted.get("reasoning"):
            output.append("═══ Reasoning ═══")
            output.append(extracted["reasoning"])
            output.append("\n═══ Answer ═══")
        output.append(final_answer)
        return "\n".join(output)
183172184173185174class StreamingReasoningHandler:
186175 """Handle reasoning during streaming generation."""
187187-176176+188177 def __init__(self, model_type: Optional[str] = None):
189178 self.model_type = model_type
190179 self.buffer = ""
···193182 self.in_reasoning = False
194183 self.in_final = False
195184 self.markers = {}
196196-185185+197186 if model_type and model_type in ReasoningExtractor.PATTERNS:
198198- self.markers = ReasoningExtractor.PATTERNS[model_type].get('markers', {})
199199-187187+ self.markers = ReasoningExtractor.PATTERNS[model_type].get("markers", {})
188188+200189 def process_token(self, token: str) -> Tuple[str, bool]:
201190 """
202191 Process a streaming token.
203203-192192+204193 Args:
205194 token: The new token
206206-195195+207196 Returns:
208197 (output_token, should_display) - token to output and whether to display it
209198 """
210199 self.buffer += token
211211-200200+212201 # Check for reasoning start
213213- if not self.in_reasoning and self.markers.get('reasoning_start'):
214214- if self.markers['reasoning_start'] in self.buffer:
202202+ if not self.in_reasoning and self.markers.get("reasoning_start"):
203203+ if self.markers["reasoning_start"] in self.buffer:
215204 self.in_reasoning = True
216216- self.reasoning_buffer = self.buffer.split(self.markers['reasoning_start'])[1]
205205+ self.reasoning_buffer = self.buffer.split(
206206+ self.markers["reasoning_start"]
207207+ )[1]
217208 return ("", False) # Don't display reasoning start marker
218218-209209+219210 # If in reasoning, buffer it
220211 if self.in_reasoning:
221212 self.reasoning_buffer += token
222222-213213+223214 # Check for reasoning end
224224- if self.markers.get('reasoning_end') and self.markers['reasoning_end'] in self.reasoning_buffer:
215215+ if (
216216+ self.markers.get("reasoning_end")
217217+ and self.markers["reasoning_end"] in self.reasoning_buffer
218218+ ):
225219 self.in_reasoning = False
226220 self.in_final = True
227221 # Clean up reasoning buffer
228228- self.reasoning_buffer = self.reasoning_buffer.replace(self.markers['reasoning_end'], '')
222222+ self.reasoning_buffer = self.reasoning_buffer.replace(
223223+ self.markers["reasoning_end"], ""
224224+ )
229225 return ("", False) # Don't display reasoning end marker
230230-226226+231227 return ("", False) # Don't display reasoning content by default
232232-228228+233229 # If in final answer section
234230 if self.in_final:
235231 # Skip final answer markers
236236- if self.markers.get('final_marker') and self.markers['final_marker'] in token:
232232+ if (
233233+ self.markers.get("final_marker")
234234+ and self.markers["final_marker"] in token
235235+ ):
237236 return ("", False)
238238-237237+239238 self.final_buffer += token
240239 return (token, True) # Display final answer
241241-240240+242241 # Default: display token if not in special section
243242 return (token, True)
244243245244246245class StreamingReasoningParser:
247246 """Parser for real-time streaming with reasoning model formatting."""
248248-247247+249248 def __init__(self, model_type: Optional[str] = None, hide_reasoning: bool = False):
250249 self.model_type = model_type
251250 self.hide_reasoning = hide_reasoning
···253252 self.buffer = ""
254253 self.reasoning_content = ""
255254 self.patterns = {}
256256-255255+257256 if model_type and model_type in ReasoningExtractor.PATTERNS:
258258- self.patterns = ReasoningExtractor.PATTERNS[model_type].get('markers', {})
259259-257257+ self.patterns = ReasoningExtractor.PATTERNS[model_type].get("markers", {})
258258+260259 def process_token(self, token: str):
261260 """
262261 Process a streaming token and yield formatted output.
263263-262262+264263 Args:
265264 token: New token from model
266266-265265+267266 Yields:
268267 Formatted output tokens for display
269268 """
270269 self.buffer += token
271271-270270+272271 # State: WAITING - looking for reasoning start
273272 if self.state == "WAITING":
274274- reasoning_start = self.patterns.get('reasoning_start')
273273+ reasoning_start = self.patterns.get("reasoning_start")
275274 if reasoning_start and reasoning_start in self.buffer:
276275 # Found reasoning start
277276 before_reasoning = self.buffer.split(reasoning_start, 1)[0]
278278-277277+279278 # Yield any content before reasoning (but not control tokens)
280280- if before_reasoning.strip() and not before_reasoning.strip().startswith('<|'):
279279+ if before_reasoning.strip() and not before_reasoning.strip().startswith(
280280+ "<|"
281281+ ):
281282 yield before_reasoning
282282-283283+283284 # Start reasoning section (only if not hiding reasoning)
284285 if not self.hide_reasoning:
285286 yield "**[Reasoning]**\n\n"
286286-287287+287288 # Switch to reasoning state
288289 self.buffer = self.buffer.split(reasoning_start, 1)[1]
289290 self.state = "IN_REASONING"
290290-291291+291292 # Process remaining buffer recursively
292293 if self.buffer.strip():
293294 yield from self.process_token("")
294295 return
295295-296296+296297 # Check if buffer might contain start of reasoning pattern
297298 if reasoning_start:
298299 # Check if buffer ends with partial pattern
···301302 if self.buffer.endswith(reasoning_start[:i]):
302303 has_partial_match = True
303304 break
304304-305305+305306 if has_partial_match:
306307 # Don't yield yet - might be building up to pattern
307308 return
308308-309309+309310 # No partial match, safe to yield older content
310311 # Keep enough buffer to detect pattern
311312 pattern_len = len(reasoning_start)
···315316 if to_yield:
316317 yield to_yield
317318 return
318318-319319+319320 # No reasoning pattern expected or very short buffer
320321 if not reasoning_start:
321322 yield token
322322-323323+323324 # State: IN_REASONING - collecting reasoning content
324325 elif self.state == "IN_REASONING":
325325- reasoning_end = self.patterns.get('reasoning_end')
326326+ reasoning_end = self.patterns.get("reasoning_end")
326327 if reasoning_end and reasoning_end in self.buffer:
327328 # Found reasoning end
328329 reasoning_part = self.buffer.split(reasoning_end, 1)[0]
329329-330330+330331 # Yield reasoning content (only if not hiding reasoning)
331332 if reasoning_part and not self.hide_reasoning:
332333 yield reasoning_part
333333-334334+334335 # Add separator (only if not hiding reasoning)
335336 if not self.hide_reasoning:
336337 yield "\n\n---\n\n**[Answer]**\n\n"
337337-338338+338339 # Switch to final state
339340 self.buffer = self.buffer.split(reasoning_end, 1)[1]
340341 self.state = "IN_FINAL"
341341- self._final_content_started = False # Track if we've started outputting final content
342342-342342+ self._final_content_started = (
343343+ False # Track if we've started outputting final content
344344+ )
345345+343346 # Skip intermediate control tokens
344344- skip_tokens = self.patterns.get('skip_tokens', [])
347347+ skip_tokens = self.patterns.get("skip_tokens", [])
345348 for skip_token in skip_tokens:
346346- self.buffer = self.buffer.replace(skip_token, '')
347347-349349+ self.buffer = self.buffer.replace(skip_token, "")
350350+348351 # Skip final marker when we find it
349349- final_marker = self.patterns.get('final_marker')
352352+ final_marker = self.patterns.get("final_marker")
350353 if final_marker and final_marker in self.buffer:
351354 self.buffer = self.buffer.split(final_marker, 1)[1]
352352-355355+353356 # Process remaining buffer
354357 if self.buffer.strip():
355358 yield from self.process_token("")
356359 return
357357-360360+358361 # Still in reasoning, yield the content (only if not hiding reasoning)
359362 if not self.hide_reasoning:
360363 yield token
361361-364364+362365 # State: IN_FINAL - normal streaming of final answer
363366 elif self.state == "IN_FINAL":
364367 # Check for control tokens from patterns that should be filtered
365365- skip_tokens = self.patterns.get('skip_tokens', [])
366366- conditional_skip = self.patterns.get('conditional_skip', [])
367367-368368+ skip_tokens = self.patterns.get("skip_tokens", [])
369369+ conditional_skip = self.patterns.get("conditional_skip", [])
370370+368371 # Check if buffer contains any skip tokens and filter them out
369372 for skip_token in skip_tokens:
370373 if skip_token in self.buffer:
371374 # Remove the skip token and continue
372372- self.buffer = self.buffer.replace(skip_token, '')
375375+ self.buffer = self.buffer.replace(skip_token, "")
373376 # Process remaining buffer if any
374377 if self.buffer.strip():
375378 yield from self.process_token("")
376379 return
377377-380380+378381 # Check for final marker and filter it too
379379- final_marker = self.patterns.get('final_marker')
382382+ final_marker = self.patterns.get("final_marker")
380383 if final_marker and final_marker in self.buffer:
381384 # Split at final marker and yield only content after it
382385 parts = self.buffer.split(final_marker, 1)
···388391 else:
389392 # Just the marker itself, skip it
390393 return
391391-394394+392395 # Check conditional skip tokens - only at start of final section
393393- if not getattr(self, '_final_content_started', False):
396396+ if not getattr(self, "_final_content_started", False):
394397 for cond_token in conditional_skip:
395398 if token.strip() == cond_token:
396399 # Skip this token at the beginning of final section
397400 return
398401 # Mark that final content has started after first non-conditional token
399399- if token.strip() and not any(token.strip() == ct for ct in conditional_skip):
402402+ if token.strip() and not any(
403403+ token.strip() == ct for ct in conditional_skip
404404+ ):
400405 self._final_content_started = True
401401-406406+402407 # Check if we might be building up to a skip token - be conservative
403408 potential_skip = False
404409 for skip_token in skip_tokens:
405405- if skip_token.startswith(token) or any(skip_token.startswith(self.buffer[-i:]) for i in range(1, min(len(skip_token), len(self.buffer)) + 1)):
410410+ if skip_token.startswith(token) or any(
411411+ skip_token.startswith(self.buffer[-i:])
412412+ for i in range(1, min(len(skip_token), len(self.buffer)) + 1)
413413+ ):
406414 potential_skip = True
407415 break
408408-416416+409417 if potential_skip:
410418 # Don't yield yet, might be building up to a skip token
411419 return
412412-420420+413421 # Normal token in final answer - safe to yield
414422 yield token
415415-423423+416424 def finalize(self):
417425 """
418426 Finalize parsing and yield any remaining buffer content.
···428436 elif self.state == "IN_FINAL":
429437 # Final answer content
430438 yield self.buffer
431431-