# MIT License

# Copyright (c) 2025 The BROKE team 🦫

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
2222+2323+"""
2424+Enhanced MLX model runner with direct API integration.
2525+Provides ollama-like run experience with streaming and interactive chat.
2626+"""
2727+2828+import json
2929+import os
3030+import time
3131+from collections.abc import Iterator
3232+from pathlib import Path
3333+from typing import Dict, Optional
3434+3535+import mlx.core as mx
3636+from mlx_lm import load
3737+from mlx_lm.generate import generate_step
3838+from mlx_lm.sample_utils import make_repetition_penalty, make_sampler
3939+4040+from .reasoning_utils import ReasoningExtractor, StreamingReasoningParser
def get_model_context_length(model_path: str) -> int:
    """Read the model's maximum context length from its ``config.json``.

    Several well-known config keys are tried in priority order; the first one
    holding a positive integer wins. Entries that are present but unusable
    (``null``, strings, booleans, non-positive numbers) are skipped so the
    function always honours its ``int`` return type.

    Args:
        model_path: Path to the MLX model directory

    Returns:
        Maximum context length for the model (defaults to 4096 if the config
        is missing, unreadable, malformed, or has no usable entry)
    """
    default_context_length = 4096

    # Try various common config keys for context length, most common first
    context_keys = (
        "max_position_embeddings",
        "n_positions",
        "context_length",
        "max_sequence_length",
        "seq_len",
    )

    config_path = os.path.join(model_path, "config.json")
    try:
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file or a model_path that is not
        # a directory; ValueError covers invalid JSON (JSONDecodeError) and
        # undecodable bytes (UnicodeDecodeError).
        return default_context_length

    # A config whose root is a list/scalar cannot carry named settings
    if not isinstance(config, dict):
        return default_context_length

    for key in context_keys:
        value = config.get(key)
        if isinstance(value, bool):
            # bool is a subclass of int; True/False is never a context length
            continue
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        if isinstance(value, int) and value > 0:
            return value

    # If no usable context length found, return reasonable default
    return default_context_length
7777+7878+7979+class MLXRunner:
8080+ """Direct MLX model runner with streaming and interactive capabilities."""
8181+8282+ def __init__(self, model_path: str, adapter_path: Optional[str] = None, verbose: bool = False):
8383+ """Initialize the runner with a model.
8484+8585+ Args:
8686+ model_path: Path to the MLX model directory
8787+ adapter_path: Optional path to LoRA adapter
8888+ verbose: Show detailed output
8989+ """
9090+ self.model_path = Path(model_path)
9191+ self.adapter_path = adapter_path
9292+ self.model = None
9393+ self.tokenizer = None
9494+ self._memory_baseline = None
9595+ self._stop_tokens = None # Will be populated from tokenizer
9696+ self._message_end_tokens = None # Message-end tokens (e.g., <|end|> for MXFP4)
9797+ self._chat_stop_tokens = None # Chat-specific stop tokens
9898+ self._context_length = None # Will be populated from model config
9999+ self._is_reasoning_model = False # Whether model uses reasoning (MXFP4)
100100+ self._reasoning_start = None # Reasoning start marker
101101+ self._reasoning_end = None # Reasoning end marker
102102+ self._final_start = None # Final answer start marker
103103+ self.verbose = verbose
104104+ self._model_loaded = False
105105+ self._context_entered = False # Prevent nested context usage
106106+107107+ def __enter__(self):
108108+ """Context manager entry - loads the model."""
109109+ if self._context_entered:
110110+ raise RuntimeError("MLXRunner context manager cannot be entered multiple times")
111111+112112+ self._context_entered = True
113113+ try:
114114+ self.load_model()
115115+ return self
116116+ except Exception:
117117+ # If load_model fails, ensure cleanup happens
118118+ self._context_entered = False
119119+ self.cleanup()
120120+ raise
121121+122122+ def __exit__(self, exc_type, exc_val, exc_tb):
123123+ """Context manager exit - cleans up the model."""
124124+ self._context_entered = False
125125+ self.cleanup()
126126+ return False # Don't suppress exceptions
127127+128128+ def load_model(self):
129129+ """Load the MLX model and tokenizer."""
130130+ if self._model_loaded:
131131+ if self.verbose:
132132+ print("Model already loaded, skipping...")
133133+ return
134134+135135+ if self.verbose:
136136+ print(f"Loading model from {self.model_path}...")
137137+ start_time = time.time()
138138+139139+ # Capture baseline memory before loading
140140+ try:
141141+ mx.clear_cache()
142142+ except Exception:
143143+ pass # Continue even if cache clear fails
144144+ self._memory_baseline = mx.get_active_memory() / 1024**3
145145+146146+ try:
147147+ # Load model and tokenizer
148148+ self.model, self.tokenizer = load(
149149+ str(self.model_path),
150150+ adapter_path=self.adapter_path
151151+ )
152152+153153+ load_time = time.time() - start_time
154154+ current_memory = mx.get_active_memory() / 1024**3
155155+ model_memory = current_memory - self._memory_baseline
156156+157157+ if self.verbose:
158158+ print(f"Model loaded in {load_time:.1f}s")
159159+ print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total")
160160+161161+ # Extract stop tokens from tokenizer
162162+ self._extract_stop_tokens()
163163+164164+ # Extract context length from model config
165165+ self._context_length = get_model_context_length(str(self.model_path))
166166+167167+ if self.verbose:
168168+ print(f"Model context length: {self._context_length} tokens")
169169+170170+ self._model_loaded = True
171171+172172+ except Exception as e:
173173+ # Ensure partial state is cleaned up on failure
174174+ self.model = None
175175+ self.tokenizer = None
176176+ self._stop_tokens = None
177177+ self._model_loaded = False
178178+ # Clear any memory that might have been allocated
179179+ mx.clear_cache()
180180+ raise RuntimeError(f"Failed to load model from {self.model_path}: {e}") from e
181181+182182+ def _extract_stop_tokens(self):
183183+ """Extract stop tokens from the tokenizer dynamically.
184184+185185+ This method identifies ALL tokens that should stop generation:
186186+ 1. Official EOS token from tokenizer config
187187+ 2. Message-end tokens from training (e.g., <|end|> for MXFP4)
188188+ 3. Common stop tokens across models
189189+ """
190190+ self._stop_tokens = set()
191191+ self._message_end_tokens = set() # Tokens that end messages but not conversations
192192+193193+ # Primary source: eos_token
194194+ eos_token = getattr(self.tokenizer, 'eos_token', None)
195195+ if eos_token:
196196+ self._stop_tokens.add(eos_token)
197197+198198+ # Also check pad_token if it's different from eos_token
199199+ pad_token = getattr(self.tokenizer, 'pad_token', None)
200200+ if pad_token and pad_token != eos_token:
201201+ self._stop_tokens.add(pad_token)
202202+203203+ # Check additional_special_tokens
204204+ if hasattr(self.tokenizer, 'additional_special_tokens'):
205205+ for token in self.tokenizer.additional_special_tokens:
206206+ if token and isinstance(token, str):
207207+ # Only add tokens that look like stop/end tokens
208208+ if any(keyword in token.lower() for keyword in ['end', 'stop', 'eot']):
209209+ self._stop_tokens.add(token)
210210+211211+ # MLX-LM 0.27.0+: Extract tokens from added_tokens_decoder (comprehensive source)
212212+ if hasattr(self.tokenizer, 'added_tokens_decoder'):
213213+ for _token_id, token_info in self.tokenizer.added_tokens_decoder.items():
214214+ if isinstance(token_info, dict) and 'content' in token_info:
215215+ token_content = token_info['content']
216216+ if token_content and isinstance(token_content, str):
217217+ token_lower = token_content.lower()
218218+219219+ # NOTE: <|end|> is NOT a stop token for MXFP4 models!
220220+ # It's a separator between reasoning and final answer
221221+ if token_content == '<|end|>':
222222+ self._message_end_tokens.add(token_content)
223223+ # Do NOT add as stop token - let model continue to final answer
224224+225225+ # Look for tokens that could be end/stop tokens
226226+ # Expanded patterns for MLX-LM 0.27.0 token varieties
227227+ # EXCLUDE <|end|> for MXFP4 models as it's a reasoning separator
228228+ end_patterns = ['stop', 'eot', 'return', 'finish', 'done', 'im_end']
229229+ if any(pattern in token_lower for pattern in end_patterns):
230230+ # Decide if it's a message-end or conversation-end token
231231+ if 'im_end' in token_lower:
232232+ self._message_end_tokens.add(token_content)
233233+ self._stop_tokens.add(token_content)
234234+ # Special handling for 'end' pattern - more selective
235235+ elif 'end' in token_lower and token_content != '<|end|>':
236236+ # Only add non-<|end|> tokens with 'end' in them
237237+ self._stop_tokens.add(token_content)
238238+239239+ # Special case: control tokens in |..| format
240240+ elif token_content.startswith('<|') and token_content.endswith('|>'):
241241+ # Be inclusive with control tokens that might stop generation
242242+ if any(pattern in token_lower for pattern in ['end', 'return', 'stop', 'finish']):
243243+ self._stop_tokens.add(token_content)
244244+245245+ # Model-specific handling based on known patterns
246246+ # Use reasoning_utils for reasoning model detection and patterns
247247+ from .reasoning_utils import ReasoningExtractor
248248+249249+ if hasattr(self.tokenizer, 'name_or_path'):
250250+ name_or_path = str(getattr(self.tokenizer, 'name_or_path', '')).lower()
251251+ model_type = ReasoningExtractor.detect_model_type(name_or_path)
252252+253253+ if model_type:
254254+ # This is a reasoning model
255255+ self._is_reasoning_model = True
256256+257257+ # Get patterns from reasoning_utils
258258+ if model_type in ReasoningExtractor.PATTERNS:
259259+ markers = ReasoningExtractor.PATTERNS[model_type]['markers']
260260+ self._reasoning_start = markers.get('reasoning_start')
261261+ self._reasoning_end = markers.get('reasoning_end')
262262+ self._final_start = markers.get('final_marker')
263263+264264+ # For reasoning models, remove reasoning_end from stop tokens
265265+ if self._reasoning_end:
266266+ self._stop_tokens.discard(self._reasoning_end)
267267+268268+ # Add proper stop token for this model type
269269+ if model_type == 'gpt-oss':
270270+ if '<|return|>' not in self._stop_tokens:
271271+ self._stop_tokens.add('<|return|>')
272272+ else:
273273+ self._is_reasoning_model = False
274274+ else:
275275+ self._is_reasoning_model = False
276276+277277+ # Add common stop tokens that might not be in special tokens
278278+ common_stop_tokens = {'</s>', '<|endoftext|>', '<|im_end|>', '<|eot_id|>'}
279279+280280+ # Add chat-specific stop tokens to prevent model self-conversations
281281+ # Based on our _format_conversation() format: "Human:" and "Assistant:"
282282+ # Also include "You:" as models might use UI-visible format
283283+ # Include single-letter variations (H:, A:, Y:) that some models use
284284+ chat_stop_tokens = {
285285+ '\nHuman:', '\nAssistant:', '\nYou:',
286286+ '\n\nHuman:', '\n\nAssistant:', '\n\nYou:',
287287+ '\nH:', '\nA:', '\nY:', # Single-letter variations
288288+ '\n\nH:', '\n\nA:', '\n\nY:'
289289+ }
290290+291291+ # Add common stop tokens only if they decode to themselves (i.e., they're single tokens)
292292+ for token in common_stop_tokens:
293293+ try:
294294+ # Try to encode and decode to verify it's a real single token
295295+ ids = self.tokenizer.encode(token, add_special_tokens=False)
296296+ if ids and len(ids) == 1: # Single token ID means it's a special token
297297+ decoded = self.tokenizer.decode(ids)
298298+ if decoded == token:
299299+ self._stop_tokens.add(token)
300300+ except:
301301+ pass
302302+303303+ # Store chat stop tokens separately - only used in interactive chat mode
304304+ # This prevents stopping mid-story when user asks for dialogues
305305+ self._chat_stop_tokens = list(chat_stop_tokens)
306306+307307+ # Remove any None values
308308+ self._stop_tokens.discard(None)
309309+ self._message_end_tokens.discard(None)
310310+311311+ # Convert to list for easier use
312312+ self._stop_tokens = list(self._stop_tokens)
313313+ self._message_end_tokens = list(self._message_end_tokens)
314314+315315+ if self.verbose:
316316+ if self._stop_tokens:
317317+ print(f"Stop tokens: {self._stop_tokens}")
318318+ if self._message_end_tokens:
319319+ print(f"Message end tokens: {self._message_end_tokens}")
320320+321321+ def cleanup(self):
322322+ """Clean up model resources and clear GPU memory.
323323+324324+ This method is safe to call multiple times and handles partial state cleanup.
325325+ """
326326+ if self.verbose and self._model_loaded:
327327+ memory_before = mx.get_active_memory() / 1024**3
328328+ print(f"Cleaning up model (memory before: {memory_before:.1f}GB)...")
329329+330330+ # Always clean up, even if model wasn't fully loaded
331331+ self.model = None
332332+ self.tokenizer = None
333333+ self._stop_tokens = None
334334+ self._message_end_tokens = None
335335+ self._chat_stop_tokens = None
336336+ self._context_length = None
337337+ self._is_reasoning_model = False
338338+ self._reasoning_start = None
339339+ self._reasoning_end = None
340340+ self._final_start = None
341341+ self._model_loaded = False
342342+343343+ # Force garbage collection and clear MLX cache
344344+ import gc
345345+ gc.collect()
346346+ try:
347347+ mx.clear_cache()
348348+ except Exception:
349349+ pass # Continue cleanup even if cache clear fails
350350+351351+ if self.verbose:
352352+ memory_after = mx.get_active_memory() / 1024**3
353353+ if 'memory_before' in locals():
354354+ memory_freed = memory_before - memory_after
355355+ print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)")
356356+ else:
357357+ print(f"Cleanup complete (memory after: {memory_after:.1f}GB)")
358358+359359+ def get_effective_max_tokens(self, requested_tokens: Optional[int], interactive: bool = False) -> int:
360360+ """Get effective max tokens based on model context and usage mode.
361361+362362+ Args:
363363+ requested_tokens: The requested max tokens (None if user didn't specify --max-tokens)
364364+ interactive: True if this is interactive mode (gets full context length)
365365+366366+ Returns:
367367+ Effective max tokens to use
368368+ """
369369+ if not self._context_length:
370370+ # Fallback when context length is unknown
371371+ fallback = 4096 if interactive else 2048
372372+ if self.verbose:
373373+ if requested_tokens is None:
374374+ print(f"[WARNING] Model context length unknown, using fallback: {fallback} tokens")
375375+ else:
376376+ print(f"[WARNING] Model context length unknown, using user specified: {requested_tokens} tokens")
377377+ return requested_tokens if requested_tokens is not None else fallback
378378+379379+ if interactive:
380380+ if requested_tokens is None:
381381+ # User didn't specify --max-tokens: use full model context
382382+ return self._context_length
383383+ else:
384384+ # User specified --max-tokens explicitly: respect their choice but cap at context
385385+ return min(requested_tokens, self._context_length)
386386+ else:
387387+ # Server/batch mode uses half context length for DoS protection
388388+ server_limit = self._context_length // 2
389389+ return min(requested_tokens or server_limit, server_limit)
390390+391391+ def generate_streaming(
392392+ self,
393393+ prompt: str,
394394+ max_tokens: int = 500,
395395+ temperature: float = 0.7,
396396+ top_p: float = 0.9,
397397+ repetition_penalty: float = 1.1,
398398+ repetition_context_size: int = 20,
399399+ use_chat_template: bool = True,
400400+ use_chat_stop_tokens: bool = False,
401401+ interactive: bool = False,
402402+ hide_reasoning: bool = False,
403403+ ) -> Iterator[str]:
404404+ """Generate text with streaming output.
405405+406406+ Args:
407407+ prompt: Input prompt
408408+ max_tokens: Maximum tokens to generate
409409+ temperature: Sampling temperature
410410+ top_p: Top-p sampling parameter
411411+ repetition_penalty: Penalty for repeated tokens
412412+ repetition_context_size: Context size for repetition penalty
413413+ use_chat_template: Apply tokenizer's chat template if available
414414+ use_chat_stop_tokens: Include chat turn markers as stop tokens (for interactive mode)
415415+ interactive: True if this is interactive mode (affects token limits)
416416+417417+ Yields:
418418+ Generated tokens as they are produced
419419+ """
420420+ if not self.model or not self.tokenizer:
421421+ raise RuntimeError("Model not loaded. Call load_model() first.")
422422+423423+ # Initialize reasoning parser if this is a reasoning model
424424+ reasoning_parser = None
425425+ if self._is_reasoning_model:
426426+ model_type = ReasoningExtractor.detect_model_type(
427427+ getattr(self.tokenizer, 'name_or_path', '') or ''
428428+ )
429429+ reasoning_parser = StreamingReasoningParser(model_type, hide_reasoning=hide_reasoning)
430430+431431+ # Apply context-aware token limits
432432+ effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive)
433433+434434+ # Apply chat template if available and requested
435435+ if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
436436+ messages = [{"role": "user", "content": prompt}]
437437+ formatted_prompt = self.tokenizer.apply_chat_template(
438438+ messages,
439439+ tokenize=False,
440440+ add_generation_prompt=True
441441+ )
442442+ else:
443443+ formatted_prompt = prompt
444444+445445+ # Tokenize the prompt
446446+ prompt_tokens = self.tokenizer.encode(formatted_prompt)
447447+ prompt_array = mx.array(prompt_tokens)
448448+449449+ # Track generation metrics
450450+ start_time = time.time()
451451+ tokens_generated = 0
452452+453453+ # Create sampler with our parameters
454454+ sampler = make_sampler(temp=temperature, top_p=top_p)
455455+456456+ # Create repetition penalty processor if needed
457457+ logits_processors = []
458458+ if repetition_penalty > 1.0:
459459+ logits_processors.append(
460460+ make_repetition_penalty(repetition_penalty, repetition_context_size)
461461+ )
462462+463463+ # Generate tokens one by one for streaming
464464+ generator = generate_step(
465465+ prompt=prompt_array,
466466+ model=self.model,
467467+ max_tokens=effective_max_tokens,
468468+ sampler=sampler,
469469+ logits_processors=logits_processors if logits_processors else None,
470470+ )
471471+472472+ # Collect tokens and yield text
473473+ generated_tokens = []
474474+ previous_decoded = ""
475475+ accumulated_response = "" # Track full response for stop token detection
476476+477477+ # Keep a sliding window of recent tokens for context
478478+ context_window = 10 # Decode last N tokens for proper spacing
479479+480480+ for token, _ in generator:
481481+ # Token might be an array or an int
482482+ token_id = token.item() if hasattr(token, 'item') else token
483483+ generated_tokens.append(token_id)
484484+485485+ # Use a sliding window approach for efficiency
486486+ start_idx = max(0, len(generated_tokens) - context_window)
487487+ window_tokens = generated_tokens[start_idx:]
488488+489489+ # Decode the window
490490+ window_text = self.tokenizer.decode(window_tokens)
491491+492492+ # Figure out what's new
493493+ if start_idx == 0:
494494+ # We're still within the context window
495495+ if window_text.startswith(previous_decoded):
496496+ new_text = window_text[len(previous_decoded):]
497497+ else:
498498+ new_text = self.tokenizer.decode([token_id])
499499+ previous_decoded = window_text
500500+ else:
501501+ # We're beyond the context window, just decode the last token with context
502502+ # This is approximate but should preserve spaces
503503+ new_text = self.tokenizer.decode(window_tokens)
504504+ if len(window_tokens) > 1:
505505+ prefix = self.tokenizer.decode(window_tokens[:-1])
506506+ if new_text.startswith(prefix):
507507+ new_text = new_text[len(prefix):]
508508+ else:
509509+ new_text = self.tokenizer.decode([token_id])
510510+511511+ if new_text:
512512+ # Update accumulated response for stop token checking
513513+ accumulated_response += new_text
514514+515515+ # Filter out stop tokens with priority: native first, then chat fallback
516516+ # Check native stop tokens FIRST in accumulated response (highest priority)
517517+ native_stop_tokens = self._stop_tokens if self._stop_tokens else []
518518+ for stop_token in native_stop_tokens:
519519+ if stop_token in accumulated_response:
520520+ # Find the stop token position and yield everything before it
521521+ stop_pos = accumulated_response.find(stop_token)
522522+ # Calculate what text came before the stop token
523523+ text_before_stop = accumulated_response[:stop_pos]
524524+ # Calculate how much of that is new (not previously yielded)
525525+ previously_yielded_length = len(accumulated_response) - len(new_text)
526526+ if len(text_before_stop) > previously_yielded_length:
527527+ # Yield only the new part before stop token
528528+ new_part_before_stop = text_before_stop[previously_yielded_length:]
529529+ if new_part_before_stop:
530530+ if reasoning_parser:
531531+ # Process through reasoning parser for formatting
532532+ for formatted_token in reasoning_parser.process_token(new_part_before_stop):
533533+ yield formatted_token
534534+ else:
535535+ yield new_part_before_stop
536536+ return # Stop generation without yielding stop token
537537+538538+ # Only check chat stop tokens if no native stop token found (fallback)
539539+ if use_chat_stop_tokens and self._chat_stop_tokens:
540540+ for stop_token in self._chat_stop_tokens:
541541+ if stop_token in accumulated_response:
542542+ # Find the stop token position and yield everything before it
543543+ stop_pos = accumulated_response.find(stop_token)
544544+ # Calculate what text came before the stop token
545545+ text_before_stop = accumulated_response[:stop_pos]
546546+ # Calculate how much of that is new (not previously yielded)
547547+ previously_yielded_length = len(accumulated_response) - len(new_text)
548548+ if len(text_before_stop) > previously_yielded_length:
549549+ # Yield only the new part before stop token
550550+ new_part_before_stop = text_before_stop[previously_yielded_length:]
551551+ if new_part_before_stop:
552552+ if reasoning_parser:
553553+ # Process through reasoning parser for formatting
554554+ for formatted_token in reasoning_parser.process_token(new_part_before_stop):
555555+ yield formatted_token
556556+ else:
557557+ yield new_part_before_stop
558558+ return # Stop generation without yielding stop token
559559+560560+ # No stop token found, process the new text
561561+ if reasoning_parser:
562562+ # Process through reasoning parser for formatting
563563+ for formatted_token in reasoning_parser.process_token(new_text):
564564+ yield formatted_token
565565+ else:
566566+ # Normal streaming for non-reasoning models
567567+ yield new_text
568568+ tokens_generated += 1
569569+570570+ # Check for EOS token - don't yield it
571571+ if token_id == self.tokenizer.eos_token_id:
572572+ break
573573+574574+ # Finalize reasoning parser if used
575575+ if reasoning_parser:
576576+ yield from reasoning_parser.finalize()
577577+578578+ # Print generation statistics if verbose
579579+ if self.verbose:
580580+ generation_time = time.time() - start_time
581581+ tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0
582582+ print(f"\n\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)")
583583+584584+ def generate_batch(
585585+ self,
586586+ prompt: str,
587587+ max_tokens: int = 500,
588588+ temperature: float = 0.7,
589589+ top_p: float = 0.9,
590590+ repetition_penalty: float = 1.1,
591591+ repetition_context_size: int = 20,
592592+ use_chat_template: bool = True,
593593+ interactive: bool = False,
594594+ ) -> str:
595595+ """Generate text in batch mode (non-streaming).
596596+597597+ Args:
598598+ prompt: Input prompt
599599+ max_tokens: Maximum tokens to generate
600600+ temperature: Sampling temperature
601601+ top_p: Top-p sampling parameter
602602+ repetition_penalty: Penalty for repeated tokens
603603+ repetition_context_size: Context size for repetition penalty
604604+ use_chat_template: Apply tokenizer's chat template if available
605605+ interactive: True if this is interactive mode (affects token limits)
606606+607607+ Returns:
608608+ Generated text
609609+ """
610610+ if not self.model or not self.tokenizer:
611611+ raise RuntimeError("Model not loaded. Call load_model() first.")
612612+613613+ # Apply context-aware token limits
614614+ effective_max_tokens = self.get_effective_max_tokens(max_tokens, interactive)
615615+616616+ # Apply chat template if available and requested
617617+ if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
618618+ messages = [{"role": "user", "content": prompt}]
619619+ formatted_prompt = self.tokenizer.apply_chat_template(
620620+ messages,
621621+ tokenize=False,
622622+ add_generation_prompt=True
623623+ )
624624+ else:
625625+ formatted_prompt = prompt
626626+627627+ start_time = time.time()
628628+629629+ # Tokenize the prompt
630630+ prompt_tokens = self.tokenizer.encode(formatted_prompt)
631631+ prompt_array = mx.array(prompt_tokens)
632632+633633+ # Create sampler with our parameters
634634+ sampler = make_sampler(temp=temperature, top_p=top_p)
635635+636636+ # Create repetition penalty processor if needed
637637+ logits_processors = []
638638+ if repetition_penalty > 1.0:
639639+ logits_processors.append(
640640+ make_repetition_penalty(repetition_penalty, repetition_context_size)
641641+ )
642642+643643+ # Generate all tokens at once
644644+ generated_tokens = []
645645+ all_tokens = list(prompt_tokens) # Keep prompt for proper decoding
646646+647647+ generator = generate_step(
648648+ prompt=prompt_array,
649649+ model=self.model,
650650+ max_tokens=effective_max_tokens,
651651+ sampler=sampler,
652652+ logits_processors=logits_processors if logits_processors else None,
653653+ )
654654+655655+ for token, _ in generator:
656656+ # Token might be an array or an int
657657+ token_id = token.item() if hasattr(token, 'item') else token
658658+ generated_tokens.append(token_id)
659659+ all_tokens.append(token_id)
660660+661661+ # Check for EOS token - don't yield it
662662+ if token_id == self.tokenizer.eos_token_id:
663663+ break
664664+665665+ # Decode all tokens together for proper spacing
666666+ full_response = self.tokenizer.decode(all_tokens)
667667+668668+ # Remove the prompt part
669669+ if full_response.startswith(formatted_prompt):
670670+ response = full_response[len(formatted_prompt):]
671671+ else:
672672+ # Fallback: just decode generated tokens
673673+ response = self.tokenizer.decode(generated_tokens)
674674+675675+ # Apply end-token filtering (same logic as streaming mode for Issue #20)
676676+ response = self._filter_end_tokens_from_response(response, use_chat_stop_tokens=False)
677677+678678+ # Format reasoning models output
679679+ response = self._format_reasoning_response(response)
680680+681681+ generation_time = time.time() - start_time
682682+683683+ # Count tokens for statistics
684684+ if self.verbose:
685685+ tokens_generated = len(generated_tokens)
686686+ tokens_per_second = tokens_generated / generation_time if generation_time > 0 else 0
687687+ print(f"\nGenerated {tokens_generated} tokens in {generation_time:.1f}s ({tokens_per_second:.1f} tokens/s)")
688688+689689+ return response
690690+691691+ def interactive_chat(
692692+ self,
693693+ system_prompt: Optional[str] = None,
694694+ max_tokens: int = 500,
695695+ temperature: float = 0.7,
696696+ top_p: float = 0.9,
697697+ repetition_penalty: float = 1.1,
698698+ use_chat_template: bool = True,
699699+ ):
700700+ """Run an interactive chat session.
701701+702702+ Args:
703703+ system_prompt: Optional system prompt to prepend
704704+ max_tokens: Maximum tokens per response
705705+ temperature: Sampling temperature
706706+ top_p: Top-p sampling parameter
707707+ repetition_penalty: Penalty for repeated tokens
708708+ use_chat_template: Use tokenizer's chat template if available
709709+ """
710710+ print("Starting interactive chat. Type 'exit' or 'quit' to end.\n")
711711+712712+ conversation_history = []
713713+ if system_prompt:
714714+ conversation_history.append({"role": "system", "content": system_prompt})
715715+716716+ while True:
717717+ try:
718718+ # Get user input
719719+ user_input = input("You: ").strip()
720720+721721+ if user_input.lower() in ['exit', 'quit', 'q']:
722722+ print("\nGoodbye!")
723723+ break
724724+725725+ if not user_input:
726726+ continue
727727+728728+ # Add user message to history
729729+ conversation_history.append({"role": "user", "content": user_input})
730730+731731+ # Format conversation for the model using chat template if available
732732+ prompt = self._format_conversation(conversation_history, use_chat_template=use_chat_template)
733733+734734+ # Generate response with streaming
735735+ print("\nAssistant: ", end="", flush=True)
736736+737737+ response_tokens = []
738738+ for token in self.generate_streaming(
739739+ prompt=prompt,
740740+ max_tokens=max_tokens,
741741+ temperature=temperature,
742742+ top_p=top_p,
743743+ repetition_penalty=repetition_penalty,
744744+ use_chat_template=False, # Already applied in _format_conversation
745745+ use_chat_stop_tokens=True, # Enable chat stop tokens in interactive mode
746746+ interactive=True, # Enable full context length for interactive mode
747747+ ):
748748+ # Stream all tokens directly (already formatted by generate_streaming)
749749+ print(token, end="", flush=True)
750750+ response_tokens.append(token)
751751+752752+ # Add assistant response to history
753753+ assistant_response = "".join(response_tokens).strip()
754754+ conversation_history.append({"role": "assistant", "content": assistant_response})
755755+756756+ print() # New line after response
757757+758758+ except KeyboardInterrupt:
759759+ print("\n\nChat interrupted. Goodbye!")
760760+ break
761761+ except Exception as e:
762762+ print(f"\n[ERROR] {e}")
763763+ continue
764764+765765+ def _format_conversation(self, messages: list, use_chat_template: bool = True) -> str:
766766+ """Format conversation history into a prompt.
767767+768768+ Uses the tokenizer's chat template if available, otherwise falls back
769769+ to the legacy Human:/Assistant: format for compatibility.
770770+771771+ Args:
772772+ messages: List of message dictionaries with 'role' and 'content'
773773+ use_chat_template: Whether to use chat template if available
774774+775775+ Returns:
776776+ Formatted conversation string
777777+ """
778778+ # Try to use native chat template if available
779779+ if use_chat_template and hasattr(self.tokenizer, 'chat_template') and self.tokenizer.chat_template:
780780+ try:
781781+ # Apply the tokenizer's chat template
782782+ formatted_prompt = self.tokenizer.apply_chat_template(
783783+ messages,
784784+ tokenize=False,
785785+ add_generation_prompt=True
786786+ )
787787+ return formatted_prompt
788788+ except Exception as e:
789789+ # If chat template fails, fall back to legacy format
790790+ if self.verbose:
791791+ print(f"[WARNING] Chat template failed, using legacy format: {e}")
792792+793793+ # Legacy format fallback for compatibility
794794+ return self._legacy_format_conversation(messages)
795795+796796+ def _legacy_format_conversation(self, messages: list) -> str:
797797+ """Legacy conversation formatting for backward compatibility.
798798+799799+ This format was used in earlier versions and remains as a fallback
800800+ for models without chat templates.
801801+ """
802802+ formatted = []
803803+804804+ for message in messages:
805805+ role = message["role"]
806806+ content = message["content"]
807807+808808+ if role == "system":
809809+ formatted.append(f"System: {content}")
810810+ elif role == "user":
811811+ formatted.append(f"Human: {content}")
812812+ elif role == "assistant":
813813+ formatted.append(f"Assistant: {content}")
814814+815815+ # Add prompt for next assistant response
816816+ formatted.append("Assistant:")
817817+818818+ return "\n\n".join(formatted)
819819+820820+ def get_memory_usage(self) -> Dict[str, float]:
821821+ """Get current memory usage statistics.
822822+823823+ Returns:
824824+ Dictionary with memory statistics in GB
825825+ """
826826+ try:
827827+ current_memory = mx.get_active_memory() / 1024**3
828828+ peak_memory = mx.get_peak_memory() / 1024**3
829829+ except Exception:
830830+ # Return zeros if memory stats unavailable
831831+ current_memory = 0.0
832832+ peak_memory = 0.0
833833+834834+ return {
835835+ "current_gb": current_memory,
836836+ "peak_gb": peak_memory,
837837+ "model_gb": current_memory - self._memory_baseline if self._memory_baseline else 0,
838838+ }
839839+840840+ def _format_reasoning_response(self, response: str) -> str:
841841+ """Format response from reasoning models for better readability.
842842+843843+ For MXFP4 models that generate reasoning followed by final answer,
844844+ format it nicely for display.
845845+ """
846846+ if not self._is_reasoning_model:
847847+ return response
848848+849849+ # Check if response contains reasoning markers
850850+ if self._reasoning_start in response and self._final_start in response:
851851+ # Extract reasoning and final parts
852852+ try:
853853+ # Split on the reasoning start
854854+ before_reasoning, after_start = response.split(self._reasoning_start, 1)
855855+856856+ # Find the reasoning content (until <|end|>)
857857+ if self._reasoning_end in after_start:
858858+ reasoning_content, after_reasoning = after_start.split(self._reasoning_end, 1)
859859+860860+ # Find the final answer
861861+ if self._final_start in after_reasoning:
862862+ # Extract everything after final marker
863863+ final_parts = after_reasoning.split(self._final_start, 1)
864864+ if len(final_parts) > 1:
865865+ # Remove the <|channel|>final<|message|> marker
866866+ final_answer = final_parts[1].replace('<|channel|>final<|message|>', '', 1)
867867+868868+ # Format with clear markers for parsing but minimal visual impact
869869+ formatted = []
870870+ formatted.append("\n**[Reasoning]**\n")
871871+ formatted.append(reasoning_content.strip())
872872+ formatted.append("\n\n---\n\n**[Answer]**\n")
873873+ formatted.append(final_answer.strip())
874874+875875+ return '\n'.join(formatted)
876876+ except Exception:
877877+ # If parsing fails, return original
878878+ pass
879879+880880+ # Fallback: just clean up the control tokens
881881+ cleaned = response
882882+ for marker in ['<|channel|>analysis<|message|>', '<|end|>', '<|start|>assistant',
883883+ '<|channel|>final<|message|>', '<|return|>']:
884884+ cleaned = cleaned.replace(marker, '')
885885+886886+ return cleaned.strip()
887887+888888+ def _filter_end_tokens_from_response(self, response: str, use_chat_stop_tokens: bool = False) -> str:
889889+ """Filter end tokens from a complete response (batch mode).
890890+891891+ This method applies the same filtering logic as the streaming mode
892892+ to ensure consistent behavior between streaming and non-streaming.
893893+894894+ Args:
895895+ response: The complete generated response
896896+ use_chat_stop_tokens: Whether to apply chat stop tokens
897897+898898+ Returns:
899899+ Response with end tokens filtered out
900900+ """
901901+ # Apply native stop token filtering FIRST (highest priority)
902902+ native_stop_tokens = self._stop_tokens if self._stop_tokens else []
903903+ for stop_token in native_stop_tokens:
904904+ if stop_token in response:
905905+ # Find the stop token position and return everything before it
906906+ stop_pos = response.find(stop_token)
907907+ filtered_response = response[:stop_pos].rstrip()
908908+ if self.verbose:
909909+ print(f"[DEBUG] Filtered stop token '{stop_token}' at position {stop_pos}")
910910+ return filtered_response
911911+912912+ # Only check chat stop tokens if no native stop token found (fallback)
913913+ if use_chat_stop_tokens and self._chat_stop_tokens:
914914+ for stop_token in self._chat_stop_tokens:
915915+ if stop_token in response:
916916+ # Find the stop token position and return everything before it
917917+ stop_pos = response.find(stop_token)
918918+ return response[:stop_pos]
919919+920920+ # No stop tokens found, return original response
921921+ return response
def get_gpu_status() -> Dict[str, float]:
    """Report MLX GPU memory without needing a runner instance.

    Errors raised by the MLX memory queries are not handled here and
    propagate to the caller.

    Returns:
        Dictionary with "active_memory_gb" and "peak_memory_gb", both in GB
    """
    bytes_per_gb = 1024**3
    active_gb = mx.get_active_memory() / bytes_per_gb
    peak_gb = mx.get_peak_memory() / bytes_per_gb
    return {
        "active_memory_gb": active_gb,
        "peak_memory_gb": peak_gb,
    }
def _detect_total_memory_gb(default_gb: float = 8.0) -> float:
    """Return the installed physical memory in GB, or default_gb if it cannot be determined."""
    try:
        page_count = os.sysconf("SC_PHYS_PAGES")
        page_size = os.sysconf("SC_PAGE_SIZE")
    except (AttributeError, ValueError, OSError):
        # os.sysconf is missing on some platforms, or the name is unknown.
        return default_gb
    if page_count <= 0 or page_size <= 0:
        # sysconf reports -1 for values it cannot determine.
        return default_gb
    return page_count * page_size / 1024**3


def check_memory_available(
    required_gb: float,
    total_gb: Optional[float] = None,
    headroom_gb: float = 2.0,
) -> bool:
    """Pre-flight check before model loading.

    The estimate is ``total - currently active MLX memory - headroom``.

    Args:
        required_gb: Required memory in GB
        total_gb: Total memory to budget against, in GB. When None, the
            machine's physical memory is detected; if detection fails a
            conservative 8 GB is assumed.
        headroom_gb: Memory in GB to leave free for system processes

    Returns:
        True if memory is likely available (conservative estimate)
    """
    if total_gb is None:
        total_gb = _detect_total_memory_gb()

    try:
        current_memory = mx.get_active_memory() / 1024**3
    except Exception:
        # Memory stats unavailable: treat as nothing allocated yet, matching
        # the fallback used by MLXRunner.get_memory_usage.
        current_memory = 0.0

    available = total_gb - current_memory - headroom_gb
    return available >= required_gb
def run_model_enhanced(
    model_path: str,
    prompt: Optional[str] = None,
    interactive: bool = False,
    max_tokens: int = 500,
    temperature: float = 0.7,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
    stream: bool = True,
    use_chat_template: bool = True,
    hide_reasoning: bool = False,
    verbose: bool = False,
) -> Optional[str]:
    """Run a model once or start an interactive chat, using MLX directly.

    The runner is opened as a context manager, so it is released when the
    function returns. Any exception is printed as "[ERROR] ..." and turned
    into a None return value.

    Args:
        model_path: Path to the MLX model
        prompt: Input prompt (if None, enters interactive mode)
        interactive: Force interactive mode
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        top_p: Top-p sampling parameter
        repetition_penalty: Penalty for repeated tokens
        stream: Whether to stream output
        use_chat_template: Forwarded to the runner's chat/generate calls
        hide_reasoning: Forwarded to streaming generation only
        verbose: Passed to the runner; also prints the prompt and memory stats

    Returns:
        Generated text (in non-interactive mode); None in interactive mode
        or after an error. A Ctrl-C during generation returns the text
        produced so far (streaming) or "" (batch).
    """
    # Options shared by every generation entry point of the runner.
    generation_options = {
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "use_chat_template": use_chat_template,
    }

    try:
        with MLXRunner(model_path, verbose=verbose) as runner:
            wants_chat = interactive or prompt is None
            if wants_chat:
                runner.interactive_chat(**generation_options)
                return None

            # Single prompt mode
            if verbose:
                print(f"\nPrompt: {prompt}\n")
                print("Response: ", end="", flush=True)

            if not stream:
                # Batch generation: print the whole response at once.
                try:
                    response = runner.generate_batch(prompt=prompt, **generation_options)
                except KeyboardInterrupt:
                    print("\n[INFO] Generation interrupted by user.")
                    response = ""
                print(response)
            else:
                # Streaming generation: echo pieces as they arrive (already
                # formatted by generate_streaming) and keep them for the
                # return value.
                pieces = []
                try:
                    for piece in runner.generate_streaming(
                        prompt=prompt,
                        hide_reasoning=hide_reasoning,
                        **generation_options,
                    ):
                        print(piece, end="", flush=True)
                        pieces.append(piece)
                except KeyboardInterrupt:
                    print("\n[INFO] Generation interrupted by user.")
                response = "".join(pieces)

            if verbose:
                stats = runner.get_memory_usage()
                print(f"\n\nMemory: {stats['model_gb']:.1f}GB model, {stats['current_gb']:.1f}GB total")

            return response

    except Exception as e:
        print(f"\n[ERROR] {e}")
        return None
10501050+
+165
server/model_card.py
···11+22+from __future__ import annotations
33+44+# ruff: noqa: UP045
55+66+"""
77+Lightweight helpers to read model metadata hints from cached Hugging Face models.
88+99+No external dependencies; YAML front matter is hand-parsed leniently.
1010+1111+Priority rules (Issue #31):
1212+- Tokenizer config: if tokenizer_config.json has chat_template -> Type = chat
1313+- README.md front matter (YAML):
1414+ - tags contains "mlx" OR library_name == "mlx" -> Framework = MLX
1515+ - pipeline_tag == text-generation OR tags contain chat/instruct -> Type = chat
1616+ - pipeline_tag == sentence-similarity OR tags contain embedding -> Type = embedding
1717+- Fallback for framework/type remains in cache_utils
1818+"""
1919+2020+import json
2121+from pathlib import Path
2222+from typing import Any, Dict, List, Optional, Tuple
2323+2424+2525+def _latest_snapshot_dir(model_base_dir: Path) -> Optional[Path]:
2626+ """Return latest snapshot directory for a cached HF model base dir."""
2727+ try:
2828+ snaps = (model_base_dir / "snapshots")
2929+ if not snaps.exists():
3030+ return None
3131+ candidates = [d for d in snaps.iterdir() if d.is_dir()]
3232+ if not candidates:
3333+ return None
3434+ return max(candidates, key=lambda p: p.stat().st_mtime)
3535+ except Exception:
3636+ return None
3737+3838+3939+def _lenient_yaml_front_matter(text: str) -> Dict[str, Any]:
4040+ """Very small YAML front matter parser for the fields we need.
4141+4242+ Supports forms:
4343+ ---
4444+ tags: [mlx, chat]
4545+ pipeline_tag: text-generation
4646+ library_name: mlx
4747+ ---
4848+4949+ And list style:
5050+ tags:
5151+ - mlx
5252+ - chat
5353+ """
5454+ start = text.find("\n---\n")
5555+ # Accept files starting directly with '---' too
5656+ if text.startswith('---'):
5757+ start = 0
5858+ elif start >= 0:
5959+ start = start + 1 # move to line start
6060+ else:
6161+ # Try at very beginning without newline
6262+ start = 0 if text[:3] == '---' else -1
6363+ if start != 0:
6464+ return {}
6565+6666+ # Find closing '---' after start
6767+ end = text.find('\n---', 3)
6868+ if end == -1:
6969+ return {}
7070+ header = text[3:end] if text.startswith('---') else text[start + 3:end]
7171+7272+ # Normalize lines
7373+ lines = [ln.strip() for ln in header.splitlines() if ln.strip()]
7474+7575+ data: Dict[str, Any] = {}
7676+ current_key: Optional[str] = None
7777+ list_acc: List[str] = []
7878+7979+ def flush_list():
8080+ nonlocal list_acc, current_key
8181+ if current_key is not None and list_acc:
8282+ data[current_key] = list_acc[:]
8383+ list_acc = []
8484+8585+ for ln in lines:
8686+ if ln.startswith('- '):
8787+ # list item under current_key
8888+ val = ln[2:].strip().strip('"\'')
8989+ if current_key is not None:
9090+ list_acc.append(val)
9191+ continue
9292+ # key: value or key: [a, b]
9393+ if ':' in ln:
9494+ # Close any previous list
9595+ flush_list()
9696+ key, val = ln.split(':', 1)
9797+ key = key.strip()
9898+ val = val.strip()
9999+ current_key = key
100100+ if not val:
101101+ # expect multi-line list next
102102+ data.setdefault(key, [])
103103+ continue
104104+ # Inline list [a, b]
105105+ if val.startswith('[') and val.endswith(']'):
106106+ inner = val[1:-1].strip()
107107+ items = [] if not inner else [it.strip().strip('"\'') for it in inner.split(',')]
108108+ data[key] = [x for x in items if x]
109109+ continue
110110+ # Scalar
111111+ data[key] = val.strip('"\'')
112112+ continue
113113+ # Non key-value, ignore
114114+ # Flush last list
115115+ flush_list()
116116+ return data
def read_readme_front_matter(model_base_dir: Path) -> Tuple[Optional[List[str]], Optional[str], Optional[str]]:
    """Extract tags, pipeline_tag and library_name from the snapshot's README.md.

    The README of the latest snapshot is read and its front matter parsed.
    Values are stripped and lower-cased. ``tags`` is only reported when the
    front matter holds it as a list; blank entries are dropped.

    Returns:
        (tags, pipeline_tag, library_name). Each element is None when absent;
        any read/parse error, a missing snapshot or README, or empty front
        matter yields (None, None, None).
    """
    nothing = (None, None, None)

    def normalized(value):
        # Falsy values (missing key, empty string, empty list) become None.
        return str(value).strip().lower() if value else None

    try:
        snapshot = _latest_snapshot_dir(model_base_dir)
        if not snapshot:
            return nothing

        readme_path = snapshot / 'README.md'
        if not readme_path.exists():
            return nothing

        front = _lenient_yaml_front_matter(
            readme_path.read_text(encoding='utf-8', errors='ignore')
        )
        if not front:
            return nothing

        raw_tags = front.get('tags')
        tags = None
        if isinstance(raw_tags, list):
            cleaned = (str(tag).strip().lower() for tag in raw_tags)
            tags = [tag for tag in cleaned if tag]

        return tags, normalized(front.get('pipeline_tag')), normalized(front.get('library_name'))
    except Exception:
        return nothing
def tokenizer_has_chat_template(model_base_dir: Path) -> bool:
    """Tell whether the latest snapshot's tokenizer_config.json has a chat template.

    Only a non-blank string under the 'chat_template' key counts. A missing
    snapshot or file, unreadable JSON, or any other error yields False.
    """
    try:
        snapshot = _latest_snapshot_dir(model_base_dir)
        if not snapshot:
            return False

        config_path = snapshot / 'tokenizer_config.json'
        if not config_path.exists():
            return False

        config = json.loads(config_path.read_text(encoding='utf-8'))
        template = config.get('chat_template')
        return isinstance(template, str) and bool(template.strip())
    except Exception:
        return False
165165+
···11+"""
22+Utilities for handling reasoning models and their output.
33+44+Different models use different formats for reasoning:
55+- MXFP4/GPT-OSS: <|channel|>analysis<|message|>REASONING<|end|>...<|channel|>final<|message|>ANSWER
66+- DeepSeek R1: <think>REASONING</think>ANSWER
77+- Claude: <thinking>REASONING</thinking>ANSWER
88+- QwQ: Similar to MXFP4
99+"""
1010+1111+import re
1212+from typing import Dict, Optional, Tuple
class ReasoningExtractor:
    """Extract reasoning and final answer from model outputs."""

    # Model-specific patterns.
    #   'reasoning' / 'final': regexes whose group 1 is the section text.
    #   'markers': literal control strings. Values are either a single string
    #   or, for 'skip_tokens' / 'conditional_skip', a list of strings.
    PATTERNS = {
        'gpt-oss': {
            'reasoning': r'<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>',
            'final': r'<\|channel\|>final<\|message\|>(.*?)(?:<\|return\|>|$)',
            'markers': {
                'reasoning_start': '<|channel|>analysis<|message|>',
                'reasoning_end': '<|end|>',
                'final_marker': '<|channel|>final<|message|>',
                # Skip tokens that appear between reasoning and final
                'skip_tokens': ['<|start|>assistant<|channel|>final<|message|>', '<|start|>assistant', '<|start|>', '<|channel|>final<|message|>'],
                # Conditional skip tokens - only skip if at start of final section
                'conditional_skip': ['assistant']
            }
        },
        'deepseek': {
            'reasoning': r'<think>(.*?)</think>',
            'final': r'</think>(.*?)$',
            'markers': {
                'reasoning_start': '<think>',
                'reasoning_end': '</think>',
            }
        },
        'claude': {
            'reasoning': r'<thinking>(.*?)</thinking>',
            'final': r'</thinking>(.*?)$',
            'markers': {
                'reasoning_start': '<thinking>',
                'reasoning_end': '</thinking>',
            }
        }
    }

    @classmethod
    def detect_model_type(cls, model_name: str) -> Optional[str]:
        """Detect reasoning model type from model name.

        Returns a key of PATTERNS, or None when the name matches no known
        reasoning model family.
        """
        model_lower = model_name.lower()

        if 'gpt-oss' in model_lower:
            return 'gpt-oss'
        elif 'deepseek' in model_lower and 'r1' in model_lower:
            return 'deepseek'
        elif 'claude' in model_lower:
            return 'claude'
        elif 'qwq' in model_lower:
            return 'gpt-oss'  # QwQ uses similar format to GPT-OSS

        return None

    @staticmethod
    def _strip_markers(text: str, markers: dict) -> str:
        """Remove every literal marker string from text.

        Marker values may be plain strings or lists of strings. Entries under
        'conditional_skip' are left alone: they are ordinary words (such as
        'assistant') that are only control text at the start of the final
        section, so removing them everywhere would damage real answers.
        """
        tokens = set()
        for name, value in markers.items():
            if name == 'conditional_skip':
                continue
            if isinstance(value, str):
                tokens.add(value)
            else:
                tokens.update(value)

        # Longest first, so a marker is removed whole before a shorter marker
        # contained in it can break it apart.
        for token in sorted(tokens, key=lambda tok: (-len(tok), tok)):
            if token:
                text = text.replace(token, '')
        return text

    @classmethod
    def extract(cls, text: str, model_type: Optional[str] = None,
                model_name: Optional[str] = None) -> Dict[str, Optional[str]]:
        """
        Extract reasoning and final answer from model output.

        Args:
            text: The full model output
            model_type: Explicit model type; one of the PATTERNS keys
                ('gpt-oss', 'deepseek', 'claude')
            model_name: Model name to auto-detect type

        Returns:
            Dictionary with 'reasoning', 'final_answer', 'full_response' and
            'has_reasoning'. When the model type is supported it also has
            'model_type'. If no final section is found, 'final_answer' falls
            back to the text after the reasoning, then to the whole text,
            with marker strings removed.
        """
        # Auto-detect model type if not provided
        if not model_type and model_name:
            model_type = cls.detect_model_type(model_name)

        # If no model type detected, return text as-is
        if not model_type or model_type not in cls.PATTERNS:
            return {
                'reasoning': None,
                'final_answer': text,
                'full_response': text,
                'has_reasoning': False
            }

        patterns = cls.PATTERNS[model_type]
        markers = patterns.get('markers', {})

        # Extract reasoning
        reasoning_match = re.search(patterns['reasoning'], text, re.DOTALL)
        reasoning = reasoning_match.group(1).strip() if reasoning_match else None

        # Extract final answer
        final_match = re.search(patterns['final'], text, re.DOTALL)
        final_answer = final_match.group(1).strip() if final_match else None

        # If no final answer found but we have reasoning,
        # the text after reasoning might be the answer
        if reasoning and not final_answer and 'reasoning_end' in markers:
            split_text = text.split(markers['reasoning_end'], 1)
            if len(split_text) > 1:
                final_answer = cls._strip_markers(split_text[1], markers).strip()

        # If still no final answer, use full text minus all known markers
        if not final_answer:
            final_answer = cls._strip_markers(text, markers).strip()

        return {
            'reasoning': reasoning,
            'final_answer': final_answer,
            'full_response': text,
            'has_reasoning': bool(reasoning),
            'model_type': model_type
        }

    @classmethod
    def format_for_display(cls, extracted: Dict[str, Optional[str]],
                           show_reasoning: bool = False) -> str:
        """
        Format extracted content for display.

        Args:
            extracted: Output from extract()
            show_reasoning: Whether to include reasoning in output

        Returns:
            Formatted string for display. Only the final answer unless
            show_reasoning is set and reasoning is present.
        """
        # A missing or None answer is shown as empty text.
        final_answer = extracted.get('final_answer') or ''

        if not extracted.get('has_reasoning') or not show_reasoning:
            return final_answer

        output = []
        if extracted.get('reasoning'):
            output.append("═══ Reasoning ═══")
            output.append(extracted['reasoning'])
            output.append("\n═══ Answer ═══")
        output.append(final_answer)
        return '\n'.join(output)
class StreamingReasoningHandler:
    """Handle reasoning during streaming generation.

    Tokens are shown until the reasoning start marker has been seen, hidden
    while inside the reasoning section, and shown again once the reasoning
    end marker has been seen.
    """

    def __init__(self, model_type: Optional[str] = None):
        self.model_type = model_type
        self.buffer = ""            # every token received so far
        self.reasoning_buffer = ""  # text of the reasoning section
        self.final_buffer = ""      # displayed text of the final section
        self.in_reasoning = False
        self.in_final = False
        self.markers = {}

        if model_type and model_type in ReasoningExtractor.PATTERNS:
            self.markers = ReasoningExtractor.PATTERNS[model_type].get('markers', {})

    def process_token(self, token: str) -> Tuple[str, bool]:
        """
        Process a streaming token.

        Args:
            token: The new token

        Returns:
            (output_token, should_display) - token to output and whether to display it
        """
        self.buffer += token
        reasoning_start = self.markers.get('reasoning_start')
        reasoning_end = self.markers.get('reasoning_end')

        # Look for the opening marker only until the reasoning section has
        # been entered once. self.buffer keeps the whole stream, so without
        # the in_final guard the old marker would be found again on every
        # later token and the final answer would be swallowed.
        if (reasoning_start and not self.in_reasoning and not self.in_final
                and reasoning_start in self.buffer):
            self.in_reasoning = True
            self.reasoning_buffer = self.buffer.split(reasoning_start, 1)[1]
            return ("", False)  # Don't display reasoning start marker

        # If in reasoning, buffer it
        if self.in_reasoning:
            self.reasoning_buffer += token

            if reasoning_end and reasoning_end in self.reasoning_buffer:
                self.in_reasoning = False
                self.in_final = True
                # Clean up reasoning buffer
                self.reasoning_buffer = self.reasoning_buffer.replace(reasoning_end, '')

            # Neither reasoning content nor its end marker is displayed.
            return ("", False)

        # If in final answer section
        if self.in_final:
            # Skip final answer markers
            final_marker = self.markers.get('final_marker')
            if final_marker and final_marker in token:
                return ("", False)

            self.final_buffer += token
            return (token, True)  # Display final answer

        # Default: display token if not in special section
        return (token, True)
class StreamingReasoningParser:
    """Parser for real-time streaming with reasoning model formatting.

    Invariant: ``self.buffer`` only ever holds text that has been received
    but not yet emitted (or dropped). Text is held back while it could still
    turn out to be the beginning of a control marker; ``finalize()`` flushes
    whatever is left. Each piece of model text is therefore emitted at most
    once.
    """

    def __init__(self, model_type: Optional[str] = None, hide_reasoning: bool = False):
        self.model_type = model_type
        self.hide_reasoning = hide_reasoning
        self.state = "WAITING"  # WAITING, IN_REASONING, IN_FINAL
        self.buffer = ""  # pending (not yet emitted) text
        self.reasoning_content = ""  # reasoning text seen so far
        self.patterns = {}
        self._final_content_started = False  # True once answer text was emitted

        if model_type and model_type in ReasoningExtractor.PATTERNS:
            self.patterns = ReasoningExtractor.PATTERNS[model_type].get('markers', {})

    @staticmethod
    def _held_back_len(text: str, markers) -> int:
        """Length of the longest suffix of text that is a proper prefix of any marker."""
        longest = 0
        for marker in markers:
            upper = min(len(marker) - 1, len(text))
            for size in range(upper, longest, -1):
                if text.endswith(marker[:size]):
                    longest = size
                    break
        return longest

    def process_token(self, token: str):
        """
        Process a streaming token and yield formatted output.

        Args:
            token: New token from model

        Yields:
            Formatted output tokens for display
        """
        self.buffer += token

        if self.state == "WAITING":
            yield from self._process_waiting()
        elif self.state == "IN_REASONING":
            yield from self._process_reasoning()
        elif self.state == "IN_FINAL":
            yield from self._process_final()

    def _process_waiting(self):
        """WAITING: pass text through while watching for the reasoning start marker."""
        reasoning_start = self.patterns.get('reasoning_start')

        if not reasoning_start:
            # No reasoning markers configured: plain pass-through.
            chunk, self.buffer = self.buffer, ""
            if chunk:
                yield chunk
            return

        if reasoning_start in self.buffer:
            before_reasoning, remainder = self.buffer.split(reasoning_start, 1)

            # Yield any content before reasoning (but not control tokens)
            if before_reasoning.strip() and not before_reasoning.strip().startswith('<|'):
                yield before_reasoning

            # Start reasoning section (only if not hiding reasoning)
            if not self.hide_reasoning:
                yield "**[Reasoning]**\n\n"

            self.buffer = remainder
            self.state = "IN_REASONING"
            if self.buffer:
                yield from self._process_reasoning()
            return

        # The buffer may end with the beginning of the marker: wait for more.
        longest_prefix = min(len(reasoning_start), len(self.buffer))
        if any(self.buffer.endswith(reasoning_start[:i]) for i in range(1, longest_prefix + 1)):
            return

        # Emit older content but keep one marker-length of text pending, so
        # control tokens directly preceding the marker can still be dropped.
        pattern_len = len(reasoning_start)
        if len(self.buffer) > pattern_len:
            chunk = self.buffer[:-pattern_len]
            self.buffer = self.buffer[-pattern_len:]
            yield chunk

    def _process_reasoning(self):
        """IN_REASONING: emit (or hide) reasoning text until the end marker."""
        reasoning_end = self.patterns.get('reasoning_end')

        if reasoning_end and reasoning_end in self.buffer:
            reasoning_part, remainder = self.buffer.split(reasoning_end, 1)
            self.reasoning_content += reasoning_part

            if not self.hide_reasoning:
                if reasoning_part:
                    yield reasoning_part
                yield "\n\n---\n\n**[Answer]**\n\n"

            self.buffer = remainder
            self.state = "IN_FINAL"
            self._final_content_started = False
            if self.buffer:
                yield from self._process_final()
            return

        # Still in reasoning: release everything except a possible partial
        # end marker at the tail.
        keep = self._held_back_len(self.buffer, [reasoning_end] if reasoning_end else [])
        ready = self.buffer[:len(self.buffer) - keep]
        self.buffer = self.buffer[len(ready):]
        self.reasoning_content += ready
        if ready and not self.hide_reasoning:
            yield ready

    def _process_final(self):
        """IN_FINAL: stream the answer, dropping control tokens."""
        skip_tokens = self.patterns.get('skip_tokens', [])
        conditional_skip = self.patterns.get('conditional_skip', [])
        final_marker = self.patterns.get('final_marker')

        # Remove complete control tokens from the pending text.
        for skip_token in skip_tokens:
            self.buffer = self.buffer.replace(skip_token, '')

        # Only content after the final marker belongs to the answer.
        if final_marker and final_marker in self.buffer:
            self.buffer = self.buffer.split(final_marker, 1)[1]

        # Conditional skip tokens are dropped only before any answer text.
        if not self._final_content_started and self.buffer.strip() in conditional_skip:
            self.buffer = ""
            return

        # Hold back a tail that could still grow into a control token.
        markers = list(skip_tokens)
        if final_marker:
            markers.append(final_marker)
        keep = self._held_back_len(self.buffer, markers)
        ready = self.buffer[:len(self.buffer) - keep]
        self.buffer = self.buffer[len(ready):]

        if ready:
            if ready.strip():
                self._final_content_started = True
            yield ready

    def finalize(self):
        """
        Finalize parsing and yield any remaining buffer content.
        Call this once, when streaming is complete.

        Only text that was not emitted before is yielded. If the reasoning
        section never ended while reasoning was hidden, the collected
        reasoning is yielded so the caller still gets some output.
        """
        pending = self.buffer
        self.buffer = ""

        if self.state == "IN_REASONING":
            self.reasoning_content += pending
            if self.hide_reasoning:
                # Reasoning never ended, output what we have
                pending = self.reasoning_content

        if pending.strip():
            yield pending
410410+
+305
server/system_prompt.txt
···11+# Memory Agent System Prompt
22+33+You are an LLM agent with a self-managed, Obsidian-like memory system. You interact with memory using Python code blocks.
44+55+## CRITICAL: Response Format Rules
66+77+**EVERY response MUST follow this EXACT structure:**
88+99+1. **Always start with `<think>`** - Your reasoning about the query and what memory operations are needed
1010+2. **Always follow with `<python>`** - Either:
1111+ - Python code to interact with memory, OR
1212+ - Empty tags `<python></python>` if no memory interaction needed
1313+3. **Only provide `<reply>` if `<python>` is empty** - Your response to the user
4. **The `<python></python>` and `<reply></reply>` blocks MUST be separate: never nest one inside the other.**
1515+1616+### Valid Response Patterns:
1717+1818+**Pattern 1: When interacting with memory**
1919+```
2020+<think>
2121+[Your reasoning here]
2222+</think>
2323+2424+<python>
2525+[Your Python code here]
2626+</python>
2727+```
2828+2929+**Pattern 2: When NOT interacting with memory**
3030+```
3131+<think>
3232+[Your reasoning here]
3333+</think>
3434+3535+<python></python>
3636+3737+<reply>
3838+[Your response to the user]
3939+</reply>
4040+```
4141+4242+**CRITICAL: Always close ALL tags! Missing </think>, </python>, or </reply> will cause errors!**
4343+4444+**NEVER:**
4545+- Skip the `<think>` block
4646+- Provide text outside of these tags
4747+- Use `<reply>` when you have Python code in `<python>`
4848+- Respond with plain text after receiving `<result>` blocks
4949+5050+## After Receiving `<result>` Blocks
5151+5252+When you receive `<result>` blocks, you MUST:
5353+1. Start a new response with `<think>`
5454+2. Analyze the results and decide if more memory operations are needed
5555+3. Either provide more Python code OR empty `<python></python>` with a `<reply>`
5656+5757+**Understanding Results:**
5858+- `{'variable_name': value}` - Your assigned variables and their values
5959+- `{}` - Empty dict means NO variables were assigned (you forgot to capture return values!)
6060+- If you get `{}`, your function calls weren't assigned to variables
6161+6262+## Memory API
6363+6464+**⚠️ CRITICAL: ALWAYS assign function results to variables or they will be LOST!**
6565+```python
6666+# CORRECT - Results are captured
6767+exists = check_if_file_exists("user.md")
6868+content = read_file("user.md")
6969+7070+# WRONG - Results are lost, you get empty {}
7171+check_if_file_exists("user.md")
7272+read_file("user.md")
7373+```
7474+7575+```python
7676+# File Operations
7777+create_file(file_path: str, content: str = "") -> bool # Auto-creates parent directories
7878+update_file(file_path: str, old_content: str, new_content: str) -> Union[bool, str] # Returns True or error message
7979+read_file(file_path: str) -> str
8080+delete_file(file_path: str) -> bool
8181+check_if_file_exists(file_path: str) -> bool
8282+8383+# Directory Operations
8484+create_dir(dir_path: str) -> bool
8585+list_files() -> str # Shows tree structure of current working directory
8686+check_if_dir_exists(dir_path: str) -> bool
8787+8888+# Utilities
8989+get_size(file_or_dir_path: str) -> int # Bytes; empty = total memory size
9090+go_to_link(link_string: str) -> bool
9191+```
9292+9393+## File Update Examples
9494+9595+### Adding to a table:
9696+```python
9797+# Find the last row and add new row after it
9898+old_content = "| 2024-03-15 | Joined Premium | Active |"
9999+new_content = """| 2024-03-15 | Joined Premium | Active |
100100+| 2024-03-20 | Added Family | Active |"""
101101+result = update_file("user.md", old_content, new_content)
102102+103103+# ALWAYS check the result!
104104+if result != True:
105105+ # Handle the error - result contains the error message
106106+ print(f"Update failed: {result}")
```
107107+108108+### Appending a new section:
```python
109109+110110+# Find the last line of a section and append after it
111111+old_content = "- favorite_color: blue"
112112+new_content = """- favorite_color: blue
113113+- favorite_food: pizza
114114+- favorite_movie: Inception"""
115115+result = update_file("user.md", old_content, new_content)
```
116116+117117+### Appending to a list:
```python
118118+119119+# Add a new item to an existing list
120120+old_content = """## Hobbies
121121+- reading
122122+- hiking"""
123123+new_content = """## Hobbies
124124+- reading
125125+- hiking
126126+- photography"""
127127+result = update_file("user.md", old_content, new_content)
```
128128+129129+### Updating a fact:
```python
130130+131131+old_content = "- user_age: 25"
132132+new_content = "- user_age: 26"
133133+result = update_file("user.md", old_content, new_content)
```
134134+135135+## Memory Structure
136136+137137+### Root Directory
138138+- `user.md`: Personal information & attributes about the user, plus relationships to other entities
139139+- `entities/`: Information about people, places, organizations, etc.
140140+ - `[entity_name].md`: One file per entity
141141+142142+### File Conventions
143143+- Dates: YYYY-MM-DD format
144144+- File names: snake_case, no spaces
145145+- All files use .md extension
146146+- New sections in files start with ## headers
147147+- Facts stored as: `- fact_name: fact_value`
148148+- Cross-references: Use `[[entity_name]]` to link between entities
149149+150150+### user.md Structure
151151+```markdown
152152+# User Information
153153+- user_name: [name]
154154+- user_age: [age]
155155+- [other attributes]
156156+157157+## User Relationships
158158+- wife: [[entities/jane_doe.md]]
159159+- friend: [[entities/john_smith.md]]
160160+- employer: [[entities/google.md]]
161161+162162+## Any other relation
163163+- name of entity: Explanation of what the markdown file stores. [[entities/entity.md]]
164164+165165+## Tables
166166+- user.md can contain tables for structured data
167167+```
168168+169169+## Memory Operation Guidelines
170170+171171+### When to Save Information
172172+- **Personal facts**: Name, age, preferences, important dates
173173+- **Relationships**: Family, friends, colleagues, organizations
174174+- **Recurring topics**: Interests, projects, goals that come up repeatedly
175175+- **Context-dependent info**: Location, job, current situation
176176+177177+### When NOT to Save
178178+- Temporary information (e.g., "what's 2+2?")
179179+- General knowledge questions
180180+- One-off calculations or lookups
181181+182182+### Entity Creation Rules
183183+- Create new entity when: First mention of a person/place/organization with substantial information
184184+- Update existing entity when: New information about known entity
185185+- Attributes (age, location, etc.) belong in the entity file, NOT as separate entities
186186+!! Make sure the information does not already exist before creating a new entity file !!
187187+188188+### Linking New Entities
189189+When creating a new entity file, ALWAYS add a link from the most relevant existing file (user.md OR another entity):
190190+191191+**Example 1: Link from user.md**
192192+```python
193193+# First: Create the new entity (entities/ dir created automatically)
194194+<python>
195195+content = """# Acme Corporation
196196+- industry: Technology
197197+- location: San Francisco, CA
198198+"""
199199+result = create_file("entities/acme_corp.md", content)
200200+</python>
201201+202202+# After result, add link to user.md
203203+<python>
204204+old_content = "## User Relationships"
205205+new_content = """## User Relationships
206206+- **Employer**: Technology company where user works as senior engineer. [[entities/acme_corp.md]]"""
207207+result = update_file("user.md", old_content, new_content)
208208+</python>
209209+```
210210+211211+**Example 2: Link between entities**
212212+```python
213213+# First: Create new entity
214214+<python>
215215+content = """# John Smith
216216+- relationship: Colleague
217217+- department: Engineering
218218+"""
219219+result = create_file("entities/john_smith.md", content)
220220+</python>
221221+222222+# After result, link from company entity
223223+<python>
224224+old_content = "## Employees"
225225+new_content = """## Employees
226226+- **Senior Engineer**: Works on backend systems team. [[entities/john_smith.md]]"""
227227+result = update_file("entities/acme_corp.md", old_content, new_content)
228228+</python>
229229+```
230230+231231+Example link descriptions:
232232+- **Primary Residence**: Three-bedroom house with home office and garden. [[entities/452_willow_creek_dr.md]]
233233+- **Project Lead**: Manages the mobile app development team. [[entities/sarah_chen.md]]
234234+- **Subsidiary**: Wholly-owned AI research division. [[entities/acme_ai_labs.md]]
235235+236236+## Important Operating Rules
237237+238238+1. **Initial Check**: On first interaction, ALWAYS check if `user.md` exists and read its contents before any other operations
239239+2. **Be Proactive**: Save relevant information without explicit requests
240240+3. **Be Selective**: Only save crucial, reusable information
241241+4. **No Print Statements**: They won't execute in the Python environment
242242+5. **Valid Python Only**: Ensure syntactically correct code
243243+6. **Execution Timeout**: Keep code blocks concise (5-second timeout)
244244+7. **No Duplicates**: Check existing content before adding information
245245+8. **CRITICAL - Use Variables**: ALWAYS capture return values for inspection
246246+ ```python
247247+ # Good - Result will be visible
248248+ exists = check_if_file_exists("user.md")
249249+ content = read_file("user.md")
250250+ result = update_file("user.md", old, new)
251251+252252+ # Bad - Result will be LOST, you'll get empty {}
253253+ check_if_file_exists("user.md")
254254+ read_file("user.md")
255255+ update_file("user.md", old, new)
256256+ ```
257257+ **WARNING**: Function calls without assignment return empty {} results!
258258+9. **Wait for Results**: After submitting Python code, wait for `<result>` blocks before proceeding
259259+10. **Error Handling**: ALWAYS check return values from file operations
260260+```python
261261+# Good - checks the result
262262+result = update_file("user.md", old, new)
263263+if result != True:
264264+ # result contains the error message
265265+266266+# Bad - ignores potential failure
267267+update_file("user.md", old, new)
268268+```
269269+11. **Your `<python>` block MUST compile under `ast.parse` and yield no `SyntaxError`**
270270+12. **One Operation Per Block**: Execute ONE main operation per `<python>` block to avoid errors
271271+```python
272272+# Good - separate operations
273273+<python>
274274+exists = check_if_file_exists("user.md")
275275+</python>
276276+# Wait for result, then:
277277+<python>
278278+if exists:
279279+ content = read_file("user.md")
280280+</python>
281281+282282+# Bad - multiple operations can cause issues
283283+<python>
284284+exists = check_if_file_exists("user.md")
285285+content = read_file("user.md")
286286+result = update_file("user.md", old, new)
287287+</python>
288288+```
289289+290290+## Memory Maintenance
291291+292292+- Keep user.md as the source of truth for user information
293293+- Ensure cross-references between entities are bidirectional when relevant
294294+- Periodically review entity relationships for consistency
295295+296296+## Correct Search Patterns
297297+298298+- Use `list_files()` to see the complete directory structure
299299+- Start by reading user.md to understand existing relationships. It's your starting point.
300300+- Follow cross-references from one markdown file to the next with `read_file()` to gather context.
301301+- Use `go_to_link()` to navigate to specific websites if needed, but only if it adds significant value to the memory.
302302+303303+## Filtering
304304+305305+In the user query, you might receive a fact-retrieval question that includes `<filter>` tags. Between these tags, the user might provide verbal filter(s) that may be inclusive or exclusive. You HAVE TO ABSOLUTELY FOLLOW THESE FILTERS. These filters provide privacy over user information. If there are no filters, just return the answer as is.