feat: added basic benchmark in the repl · tiles.run/tiles@9bc4b4b

+3

Cargo.lock

··· 528 528 checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" 529 529 dependencies = [ 530 530 "iana-time-zone", 531 + "js-sys", 531 532 "num-traits", 533 + "wasm-bindgen", 532 534 "windows-link", 533 535 ] 534 536 ··· 3314 3316 version = "0.4.0-rc.1" 3315 3317 dependencies = [ 3316 3318 "anyhow", 3319 + "chrono", 3317 3320 "clap", 3318 3321 "futures-util", 3319 3322 "hf-hub",

+15 -1

server/backend/mlx.py

··· 1 1 from .mlx_runner import MLXRunner 2 2 from ..cache_utils import get_model_path 3 3 from fastapi import HTTPException 4 - from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest 4 + from ..schemas import ChatMessage, ChatCompletionRequest, downloadRequest, GenerationMetrics 5 5 from ..hf_downloader import pull_model 6 6 7 7 import logging ··· 113 113 yield f"data: {json.dumps(initial_response)}\n\n" 114 114 115 115 # Stream tokens 116 + metrics = None 116 117 try: 117 118 for token in runner.generate_streaming( 118 119 prompt=prompt, ··· 125 126 use_chat_template=False, # Already applied in _format_conversation 126 127 use_chat_stop_tokens=False, # Server mode shouldn't stop on chat markers 127 128 ): 129 + if isinstance(token, GenerationMetrics): 130 + metrics = token 131 + continue 132 + 128 133 chunk_response = { 129 134 "id": completion_id, 130 135 "object": "chat.completion.chunk", ··· 165 170 "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], 166 171 } 167 172 173 + # Include benchmarking metrics if available 174 + if metrics: 175 + final_response["metrics"] = { 176 + "ttft_ms": metrics.ttft_ms, 177 + "total_tokens": metrics.total_tokens, 178 + "tokens_per_second": metrics.tokens_per_second, 179 + "total_latency_s": metrics.total_latency_s, 180 + } 181 + print(f"data: {json.dumps(final_response)}") 168 182 yield f"data: {json.dumps(final_response)}\n\n" 169 183 yield "data: [DONE]\n\n" 170 184

+39 -2

server/backend/mlx_runner.py

··· 18 18 from mlx_lm import load 19 19 from mlx_lm.generate import generate_step 20 20 from mlx_lm.sample_utils import make_repetition_penalty, make_sampler 21 - 21 + from ..schemas import GenerationMetrics 22 22 from ..reasoning_utils import ReasoningExtractor, StreamingReasoningParser 23 23 24 24 ··· 474 474 # Track generation metrics 475 475 start_time = time.time() 476 476 tokens_generated = 0 477 - 477 + ttft = None 478 478 # Create sampler with our parameters 479 479 sampler = make_sampler(temp=temperature, top_p=top_p) 480 480 ··· 566 566 yield formatted_token 567 567 else: 568 568 yield new_part_before_stop 569 + if reasoning_parser: 570 + yield from reasoning_parser.finalize() 571 + total_latency = time.time() - start_time 572 + tokens_per_second = tokens_generated / total_latency if total_latency > 0 else 0 573 + ttft_ms = (ttft * 1000) if ttft is not None else 0 574 + yield GenerationMetrics( 575 + ttft_ms=ttft_ms, 576 + total_tokens=tokens_generated, 577 + tokens_per_second=tokens_per_second, 578 + total_latency_s=total_latency 579 + ) 569 580 return # Stop generation without yielding stop token 570 581 571 582 # Only check chat stop tokens if no native stop token found (fallback) ··· 596 607 yield formatted_token 597 608 else: 598 609 yield new_part_before_stop 610 + if reasoning_parser: 611 + yield from reasoning_parser.finalize() 612 + total_latency = time.time() - start_time 613 + tokens_per_second = tokens_generated / total_latency if total_latency > 0 else 0 614 + ttft_ms = (ttft * 1000) if ttft is not None else 0 615 + yield GenerationMetrics( 616 + ttft_ms=ttft_ms, 617 + total_tokens=tokens_generated, 618 + tokens_per_second=tokens_per_second, 619 + total_latency_s=total_latency 620 + ) 599 621 return # Stop generation without yielding stop token 622 + 623 + if ttft is None: 624 + ttft = time.time() - start_time 600 625 601 626 # No stop token found, process the new text 602 627 if reasoning_parser: ··· 615 640 # Finalize reasoning parser if used 616 641 if reasoning_parser: 617 642 yield from reasoning_parser.finalize() 643 + 644 + # Yield metrics at the end 645 + total_latency = time.time() - start_time 646 + tokens_per_second = tokens_generated / total_latency if total_latency > 0 else 0 647 + ttft_ms = (ttft * 1000) if ttft is not None else 0 648 + metrics = GenerationMetrics( 649 + ttft_ms=ttft_ms, 650 + total_tokens=tokens_generated, 651 + tokens_per_second=tokens_per_second, 652 + total_latency_s=total_latency 653 + ) 654 + yield metrics 618 655 619 656 # Print generation statistics if verbose 620 657 if self.verbose:

+9

server/schemas.py

··· 1 1 from pydantic import BaseModel, Field 2 2 from typing import Any, Dict, List, Optional, Union 3 + from dataclasses import dataclass 3 4 4 5 class CompletionRequest(BaseModel): 5 6 model: str ··· 63 64 64 65 class downloadRequest(BaseModel): 65 66 model: str 67 + 68 + @dataclass 69 + class GenerationMetrics: 70 + """Benchmarking metrics for token generation.""" 71 + ttft_ms: float # Time to first token in milliseconds 72 + total_tokens: int # Total tokens generated 73 + tokens_per_second: float # Throughput 74 + total_latency_s: float # End-to-end latency in seconds

+1

tiles/Cargo.toml

··· 15 15 futures-util = "0.3" 16 16 hf-hub = {version = "0.4", features = ["tokio"]} 17 17 rustyline = "17.0" 18 + chrono = "0.4"

+43 -2

tiles/src/runtime/mlx.rs

··· 14 14 use rustyline::history::DefaultHistory; 15 15 use rustyline::validate::Validator; 16 16 use rustyline::{Config, Editor, Helper}; 17 + use serde::{Deserialize, Serialize}; 17 18 use serde_json::{Value, json}; 18 19 use std::fs; 19 20 use std::fs::File; ··· 23 24 use std::{io, process::Command}; 24 25 use tilekit::modelfile::Modelfile; 25 26 use tokio::time::sleep; 27 + 28 + #[derive(Debug, Deserialize, Serialize)] 29 + pub struct BenchmarkMetrics { 30 + ttft_ms: f64, 31 + total_tokens: i32, 32 + tokens_per_second: f64, 33 + total_latency_s: f64, 34 + } 35 + 26 36 pub struct MLXRuntime {} 27 37 28 38 impl MLXRuntime {} ··· 30 40 // think: String, 31 41 reply: String, 32 42 code: String, 43 + metrics: Option<BenchmarkMetrics>, 33 44 } 34 45 35 46 impl Default for MLXRuntime { ··· 413 424 } else { 414 425 println!("\n"); 415 426 } 427 + // Display benchmark metrics if available 428 + if let Some(metrics) = response.metrics { 429 + println!( 430 + "{}", 431 + format!( 432 + "\n{} {:.1} tok/s | {} tokens | {:.0}ms TTFT", 433 + "💡".yellow(), 434 + metrics.tokens_per_second, 435 + metrics.total_tokens, 436 + metrics.ttft_ms 437 + ) 438 + .dimmed() 439 + ); 440 + } 441 + 416 442 break; 417 443 } 418 444 } else { ··· 505 531 let mut stream = res.bytes_stream(); 506 532 let mut accumulated = String::new(); 507 533 println!(); 534 + let mut metrics: Option<BenchmarkMetrics> = None; 508 535 let mut is_answer_start = false; 509 536 while let Some(chunk) = stream.next().await { 510 537 let chunk = chunk.unwrap(); ··· 517 544 let data = line.trim_start_matches("data: "); 518 545 519 546 if data == "[DONE]" { 520 - return Ok(convert_to_chat_response(&accumulated, run_args.memory)); 547 + return Ok(convert_to_chat_response( 548 + &accumulated, 549 + run_args.memory, 550 + metrics, 551 + )); 521 552 } 553 + 522 554 // Parse JSON 523 555 let v: Value = serde_json::from_str(data).unwrap(); 556 + // Check for metrics in the response 557 + if let Some(metrics_obj) = v.get("metrics") { 558 + metrics = serde_json::from_value(metrics_obj.clone()).ok(); 559 + } 524 560 if let Some(delta) = v["choices"][0]["delta"]["content"].as_str() { 525 561 accumulated.push_str(delta); 526 562 if !run_args.memory && delta.contains("**[Answer]**") { ··· 539 575 Err(String::from("request failed")) 540 576 } 541 577 542 - fn convert_to_chat_response(content: &str, memory_mode: bool) -> ChatResponse { 578 + fn convert_to_chat_response( 579 + content: &str, 580 + memory_mode: bool, 581 + metrics: Option<BenchmarkMetrics>, 582 + ) -> ChatResponse { 543 583 ChatResponse { 544 584 reply: extract_reply(content, memory_mode), 545 585 code: extract_python(content), 586 + metrics, 546 587 } 547 588 } 548 589

Configure Feed

Configure Feed