feat: first stab at responses api · tiles.run/tiles@b126eeb

+21 -3

server/api.py

··· 1 1 from fastapi import FastAPI, HTTPException 2 2 3 - from .schemas import ChatMessage, ChatCompletionRequest, StartRequest, downloadRequest 3 + from .schemas import ( 4 + ChatMessage, 5 + ChatCompletionRequest, 6 + StartRequest, 7 + downloadRequest, 8 + ResponsesRequest, 9 + ) 4 10 import logging 5 11 import sys 6 12 from typing import Optional ··· 10 16 11 17 from .hf_downloader import pull_model 12 18 13 - from server.mem_agent.utils import ( 19 + from .mem_agent.utils import ( 14 20 create_memory_if_not_exists, 15 21 format_results, 16 22 ) 17 - from server.mem_agent.engine import execute_sandboxed_code 23 + from .mem_agent.engine import execute_sandboxed_code 18 24 19 25 from . import runtime 20 26 ··· 39 45 """Download the model""" 40 46 runtime.backend.download_model(request.model) 41 47 48 + 42 49 @app.post("/start") 43 50 async def start_model(request: StartRequest): 44 51 """Load the model and start the agent""" ··· 77 84 ) 78 85 except Exception as e: 79 86 raise HTTPException(status_code=500, detail=str(e)) 87 + 88 + 89 + @app.post("/v1/responses") 90 + async def create_chat_response(request: ResponsesRequest): 91 + """ 92 + Create a response with openResponse format 93 + """ 94 + 95 + global _messages 96 + 97 + return await runtime.backend.generate_response_chat(request)

+52 -1

server/backend/mlx.py

··· 1 1 from .mlx_runner import MLXRunner 2 2 from ..cache_utils import get_model_path 3 3 from fastapi import HTTPException 4 - from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest, GenerationMetrics 4 + from ..schemas import ( 5 + ChatMessage, 6 + ChatCompletionRequest, 7 + ResponsesResponse, 8 + downloadRequest, 9 + GenerationMetrics, 10 + ResponsesRequest, 11 + ) 5 12 from ..hf_downloader import pull_model 6 13 7 14 import logging ··· 77 84 logger.info(f"Model {model_name} already in memory") 78 85 79 86 return _model_cache[model_path_str] 87 + 80 88 81 89 async def generate_chat_stream( 82 90 messages: List[ChatMessage], request: ChatCompletionRequest ··· 181 189 yield f"data: {json.dumps(final_response)}\n\n" 182 190 yield "data: [DONE]\n\n" 183 191 192 + 184 193 def format_chat_messages_for_runner( 185 194 messages: List[ChatMessage], 186 195 ) -> List[Dict[str, str]]: ··· 195 204 """Rough token count estimation.""" 196 205 return int(len(text.split()) * 1.3) # Approximation, convert to int 197 206 207 + 208 + async def generate_response_chat(request: ResponsesRequest): 209 + """Generate chat responses""" 210 + 211 + model = request.model or "mlx-community/gpt-oss-20b-MXFP4-Q4" 212 + input = request.input or "" 213 + response_id = f"resp-{uuid.uuid4()}" 214 + msg_id = f"msg_{uuid.uuid4()}" 215 + created = int(time.time()) 216 + runner = get_or_load_model(model) 217 + generated_text = runner.generate_batch( 218 + prompt=input, 219 + max_tokens=runner.get_effective_max_tokens(request.max_output_tokens), 220 + temperature=request.temperature or 1, 221 + top_p=request.top_p or 1, 222 + use_chat_template=True, # Already applied in _format_conversation 223 + ) 224 + completed_at = int(time.time()) 225 + return ResponsesResponse( 226 + id=response_id, 227 + created_at=created, 228 + completed_at=completed_at, 229 + model=model, 230 + status="completed", 231 + object="response", 232 + output=[ 233 + { 234 + "type": "message", 235 + "id": msg_id, 236 + "status": "completed", 237 + "role": "assistant", 238 + "content": [ 239 + { 240 + "type": "output_text", 241 + "text": generated_text, 242 + "annotations": [], 243 + } 244 + ], 245 + } 246 + ], 247 + usage={"input_tokens": 36}, 248 + )

+3 -1

server/runtime.py

··· 1 - backend = None 1 + from typing import Any 2 + 3 + backend: Any = None

+44 -1

server/schemas.py

··· 2 2 from typing import Any, Dict, List, Optional, Union 3 3 from dataclasses import dataclass 4 4 5 + 5 6 class CompletionRequest(BaseModel): 6 7 model: str 7 8 prompt: Union[str, List[str]] ··· 60 61 class StartRequest(BaseModel): 61 62 model: str 62 63 memory_path: str 63 - system_prompt: str 64 + system_prompt: str 65 + 64 66 65 67 class downloadRequest(BaseModel): 66 68 model: str 67 69 70 + 71 + class ResponsesRequest(BaseModel): 72 + model: Optional[str] = None 73 + input: Optional[str] = None 74 + reasoning: Optional[Dict[str, Any]] = None 75 + previous_response_id: Optional[str] = None 76 + stream: Optional[bool] = False 77 + tools: Optional[List[Dict[str, Any]]] = None 78 + temperature: Optional[float] = 1 79 + top_p: Optional[float] = 1 80 + max_output_tokens: Optional[int] = None 81 + 82 + 83 + class ResponsesResponse(BaseModel): 84 + id: str 85 + object: str = "response" 86 + created_at: int 87 + status: str 88 + completed_at: Optional[int] = None 89 + error: Optional[Dict[str, Any]] = None 90 + incomplete_details: Optional[Dict[str, Any]] = None 91 + instructions: Optional[str] = None 92 + max_output_tokens: Optional[int] = None 93 + model: str 94 + output: List[Dict[str, Any]] 95 + parallel_tool_calls: bool = True 96 + previous_response_id: Optional[str] = None 97 + reasoning: Optional[Dict[str, Any]] = Field(default_factory=dict) 98 + store: bool = True 99 + temperature: float = 1.0 100 + text: Dict[str, Any] = Field(default_factory=lambda: {"format": {"type": "text"}}) 101 + tool_choice: Union[str, Dict[str, Any]] = "auto" 102 + tools: List[Dict[str, Any]] = Field(default_factory=list) 103 + top_p: float = 1.0 104 + truncation: str = "disabled" 105 + usage: Dict[str, Any] 106 + user: Optional[str] = None 107 + metadata: Dict[str, Any] = Field(default_factory=dict) 108 + 109 + 68 110 @dataclass 69 111 class GenerationMetrics: 70 112 """Benchmarking metrics for token generation.""" 113 + 71 114 ttft_ms: float # Time to first token in milliseconds 72 115 total_tokens: int # Total tokens generated 73 116 tokens_per_second: float # Throughput

+3 -3

server/stack/requirements/app-server/packages-app-server.txt

··· 19 19 mypy-extensions==1.1.0 20 20 numpy==2.4.1 21 21 packaging==26.0 22 - pathspec==1.0.3 22 + pathspec==1.0.4 23 23 platformdirs==4.5.1 24 - protobuf==6.33.4 24 + protobuf==6.33.5 25 25 pydantic==2.12.5 26 26 pydantic-core==2.41.5 27 - pytokens==0.4.0 27 + pytokens==0.4.1 28 28 pyyaml==6.0.3 29 29 regex==2026.1.15 30 30 requests==2.32.5

+2 -2

server/stack/requirements/app-server/pylock.app-server.meta.json

··· 1 1 { 2 2 "lock_input_hash": "sha256:182c606e20dd957344cc3adc54391f47f4b6dd80b4481ddf219392a7aad6e0ce", 3 3 "lock_version": 1, 4 - "locked_at": "2026-01-22T05:41:48.443112+00:00", 4 + "locked_at": "2026-01-30T08:41:45.203370+00:00", 5 5 "other_inputs_hash": "sha256:63b3c2cfe2ec414938e81dace7aac779c7b902bae681618cd8827e9f16880985", 6 - "requirements_hash": "sha256:a08c15387b6f199fe37fad0855c14ffde941d1c0b49f94fa1ed48a9464fab9a6", 6 + "requirements_hash": "sha256:288220847007f2f14c9a0aa2a972b33e92f6bb84f25dac1a248fbe6e55ec2bea", 7 7 "version_inputs_hash": "sha256:58db986b7cd72eeded675f7c9afd8138fe024fb51451131b5562922bbde3cf43" 8 8 }

+35 -35

server/stack/requirements/app-server/pylock.app-server.toml

··· 440 440 441 441 [[packages]] 442 442 name = "pathspec" 443 - version = "1.0.3" 443 + version = "1.0.4" 444 444 index = "https://pypi.org/simple" 445 445 446 446 [[packages.wheels]] 447 - url = "https://files.pythonhosted.org/packages/32/2b/121e912bd60eebd623f873fd090de0e84f322972ab25a7f9044c056804ed/pathspec-1.0.3-py3-none-any.whl" 448 - upload-time = 2026-01-09T15:46:44Z 449 - size = 55021 447 + url = "https://files.pythonhosted.org/packages/ef/3c/2c197d226f9ea224a9ab8d197933f9da0ae0aac5b6e0f884e2b8d9c8e9f7/pathspec-1.0.4-py3-none-any.whl" 448 + upload-time = 2026-01-27T03:59:45Z 449 + size = 55206 450 450 451 451 [packages.wheels.hashes] 452 - sha256 = "e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c" 452 + sha256 = "fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723" 453 453 454 454 [[packages]] 455 455 name = "platformdirs" ··· 466 466 467 467 [[packages]] 468 468 name = "protobuf" 469 - version = "6.33.4" 469 + version = "6.33.5" 470 470 index = "https://pypi.org/simple" 471 471 472 472 [[packages.wheels]] 473 - url = "https://files.pythonhosted.org/packages/66/15/6ee23553b6bfd82670207ead921f4d8ef14c107e5e11443b04caeb5ab5ec/protobuf-6.33.4-cp39-abi3-macosx_10_9_universal2.whl" 474 - upload-time = 2026-01-12T18:33:32Z 475 - size = 427612 473 + url = "https://files.pythonhosted.org/packages/a2/6b/e48dfc1191bc5b52950246275bf4089773e91cb5ba3592621723cdddca62/protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl" 474 + upload-time = 2026-01-29T21:51:25Z 475 + size = 427766 476 476 477 477 [packages.wheels.hashes] 478 - sha256 = "2fe67f6c014c84f655ee06f6f66213f9254b3a8b6bda6cda0ccd4232c73c06f0" 478 + sha256 = "a5cb85982d95d906df1e2210e58f8e4f1e3cdc088e52c921a041f9c9a0386de5" 479 479 480 480 [[packages.wheels]] 481 - url = "https://files.pythonhosted.org/packages/e8/8e/971c0edd084914f7ee7c23aa70ba89e8903918adca179319ee94403701d5/protobuf-6.33.4-cp39-abi3-manylinux2014_x86_64.whl" 482 - upload-time = 2026-01-12T18:33:36Z 483 - size = 323311 481 + url = "https://files.pythonhosted.org/packages/9b/53/a9443aa3ca9ba8724fdfa02dd1887c1bcd8e89556b715cfbacca6b63dbec/protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl" 482 + upload-time = 2026-01-29T21:51:28Z 483 + size = 323465 484 484 485 485 [packages.wheels.hashes] 486 - sha256 = "3df850c2f8db9934de4cf8f9152f8dc2558f49f298f37f90c517e8e5c84c30e9" 486 + sha256 = "cbf16ba3350fb7b889fca858fb215967792dc125b35c7976ca4818bee3521cf0" 487 487 488 488 [[packages.wheels]] 489 - url = "https://files.pythonhosted.org/packages/75/b1/1dc83c2c661b4c62d56cc081706ee33a4fc2835bd90f965baa2663ef7676/protobuf-6.33.4-py3-none-any.whl" 490 - upload-time = 2026-01-12T18:33:39Z 491 - size = 170532 489 + url = "https://files.pythonhosted.org/packages/57/bf/2086963c69bdac3d7cff1cc7ff79b8ce5ea0bec6797a017e1be338a46248/protobuf-6.33.5-py3-none-any.whl" 490 + upload-time = 2026-01-29T21:51:32Z 491 + size = 170687 492 492 493 493 [packages.wheels.hashes] 494 - sha256 = "1fe3730068fcf2e595816a6c34fe66eeedd37d51d0400b72fabc848811fdc1bc" 494 + sha256 = "69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02" 495 495 496 496 [[packages]] 497 497 name = "pydantic" ··· 537 537 538 538 [[packages]] 539 539 name = "pytokens" 540 - version = "0.4.0" 540 + version = "0.4.1" 541 541 index = "https://pypi.org/simple" 542 542 543 543 [[packages.wheels]] 544 - url = "https://files.pythonhosted.org/packages/98/63/627b7e71d557383da5a97f473ad50f8d9c2c1f55c7d3c2531a120c796f6e/pytokens-0.4.0-cp313-cp313-macosx_11_0_arm64.whl" 545 - upload-time = 2026-01-19T07:59:16Z 546 - size = 159744 544 + url = "https://files.pythonhosted.org/packages/cb/dc/08b1a080372afda3cceb4f3c0a7ba2bde9d6a5241f1edb02a22a019ee147/pytokens-0.4.1-cp313-cp313-macosx_11_0_arm64.whl" 545 + upload-time = 2026-01-30T01:03:13Z 546 + size = 160720 547 547 548 548 [packages.wheels.hashes] 549 - sha256 = "73eff3bdd8ad08da679867992782568db0529b887bed4c85694f84cdf35eafc6" 549 + sha256 = "8bdb9d0ce90cbf99c525e75a2fa415144fd570a1ba987380190e8b786bc6ef9b" 550 550 551 551 [[packages.wheels]] 552 - url = "https://files.pythonhosted.org/packages/ab/96/04102856b9527701ae57d74a6393d1aca5bad18a1b1ca48ccffb3c93b392/pytokens-0.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl" 553 - upload-time = 2026-01-19T07:59:19Z 554 - size = 267452 552 + url = "https://files.pythonhosted.org/packages/e0/d2/afe5c7f8607018beb99971489dbb846508f1b8f351fcefc225fcf4b2adc0/pytokens-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl" 553 + upload-time = 2026-01-30T01:03:15Z 554 + size = 268423 555 555 556 556 [packages.wheels.hashes] 557 - sha256 = "a2c8952c537cb73a1a74369501a83b7f9d208c3cf92c41dd88a17814e68d48ce" 557 + sha256 = "29d1d8fb1030af4d231789959f21821ab6325e463f0503a61d204343c9b355d1" 558 558 559 559 [[packages.wheels]] 560 - url = "https://files.pythonhosted.org/packages/0e/ef/0936eb472b89ab2d2c2c24bb81c50417e803fa89c731930d9fb01176fe9f/pytokens-0.4.0-cp313-cp313-musllinux_1_2_x86_64.whl" 561 - upload-time = 2026-01-19T07:59:20Z 562 - size = 265965 560 + url = "https://files.pythonhosted.org/packages/68/d4/00ffdbd370410c04e9591da9220a68dc1693ef7499173eb3e30d06e05ed1/pytokens-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl" 561 + upload-time = 2026-01-30T01:03:17Z 562 + size = 266859 563 563 564 564 [packages.wheels.hashes] 565 - sha256 = "5dbf56f3c748aed9310b310d5b8b14e2c96d3ad682ad5a943f381bdbbdddf753" 565 + sha256 = "970b08dd6b86058b6dc07efe9e98414f5102974716232d10f32ff39701e841c4" 566 566 567 567 [[packages.wheels]] 568 - url = "https://files.pythonhosted.org/packages/7c/3c/6941a82f4f130af6e1c68c076b6789069ef10c04559bd4733650f902fd3b/pytokens-0.4.0-py3-none-any.whl" 569 - upload-time = 2026-01-19T07:59:49Z 570 - size = 13224 568 + url = "https://files.pythonhosted.org/packages/c6/78/397db326746f0a342855b81216ae1f0a32965deccfd7c830a2dbc66d2483/pytokens-0.4.1-py3-none-any.whl" 569 + upload-time = 2026-01-30T01:03:45Z 570 + size = 13729 571 571 572 572 [packages.wheels.hashes] 573 - sha256 = "0508d11b4de157ee12063901603be87fb0253e8f4cb9305eb168b1202ab92068" 573 + sha256 = "26cef14744a8385f35d0e095dc8b3a7583f6c953c2e3d269c7f82484bf5ad2de" 574 574 575 575 [[packages]] 576 576 name = "pyyaml"

-2

tiles/src/runtime/mlx.rs

··· 274 274 275 275 println!("Usage Tips:"); 276 276 println!(" - Type your questions or prompts directly"); 277 - println!(" - Model outputs <think>, <python>, and <reply> tags"); 278 - println!(" - Only <reply> content is shown as final output"); 279 277 println!(); 280 278 } 281 279

Configure Feed

Configure Feed