fix: added model download resuming for partial downloads

+42 -8

2 changed files

expand all

tiles

src

runtime

mlx.rs

utils

config.rs

+41 -7

tiles/src/runtime/mlx.rs

··· 26 26 use tilekit::modelfile::Role; 27 27 use tokio::time::sleep; 28 28 29 + const MAX_LOAD_MODEL_RETRIES: u8 = 3; 30 + 29 31 #[derive(Debug, Deserialize, Serialize, Clone)] 30 32 pub struct BenchmarkMetrics { 31 33 ttft_ms: f64, ··· 238 240 } 239 241 // loading the model from mem-agent via daemon server 240 242 let memory_path = get_memory_path().context("Setting/Retrieving memory_path failed")?; 241 - match load_model(&modelfile, &default_modelfile, &memory_path).await { 243 + match load_model(&modelfile, &default_modelfile, &memory_path, 0).await { 242 244 Ok(_) => start_repl(mlx_runtime, &modelfile, run_args, db_conn).await?, 243 245 Err(err) => return Err(anyhow::anyhow!(err)), 244 246 } ··· 400 402 modelfile: &Modelfile, 401 403 default_modelfile: &Modelfile, 402 404 memory_path: &str, 405 + retries: u8, 403 406 ) -> Result<()> { 407 + if retries > MAX_LOAD_MODEL_RETRIES { 408 + return Err(anyhow!( 409 + "Model loading retried failed after {} times", 410 + retries 411 + )); 412 + } 404 413 let model_name = modelfile.from.clone().unwrap(); 414 + let model_cache_res = get_model_cache(&model_name); 405 415 406 - if let Ok(model_cache_path) = get_model_cache(&model_name) { 407 - load_model_in_py(modelfile, default_modelfile, memory_path, &model_cache_path).await 408 - } else { 416 + if model_cache_res.is_err() { 409 417 download_model(&model_name).await?; 410 - let model_cache_path = get_model_cache(&model_name)?; 411 - load_model_in_py(modelfile, default_modelfile, memory_path, &model_cache_path).await 418 + return Box::pin(load_model(modelfile, default_modelfile, memory_path, 0)).await; 419 + } 420 + 421 + // If loading fails it most probably a partial downloaded 422 + // model present, so we try to resume the download 423 + if load_model_in_py( 424 + modelfile, 425 + default_modelfile, 426 + memory_path, 427 + &model_cache_res.unwrap(), 428 + ) 429 + .await 430 + .is_err() 431 + { 432 + log::warn!("Load model failed, resuming the partial download"); 433 + download_model(&model_name).await?; 434 + Box::pin(load_model( 435 + modelfile, 436 + default_modelfile, 437 + memory_path, 438 + retries + 1, 439 + )) 440 + .await 441 + } else { 442 + Ok(()) 412 443 } 413 444 } 414 445 ··· 635 666 model_cache_path: &PathBuf, 636 667 ) -> Result<()> { 637 668 let client = Client::new(); 638 - let model_name = modelfile.from.clone().unwrap(); 669 + let model_name = modelfile 670 + .from 671 + .clone() 672 + .expect("Failed to get `FROM` of modelfile"); 639 673 let body = json!({ 640 674 "model": model_name, 641 675 "memory_path": memory_path,

+1 -1

tiles/src/utils/config.rs

··· 236 236 Ok(()) 237 237 } 238 238 239 - // Get the apt path where the model lies 239 + /// Get the apt path where the model in the system 240 240 pub fn get_model_cache(model_name: &str) -> Result<PathBuf> { 241 241 let hf_model_dir = if model_name.starts_with("mlx-community/") { 242 242 let model_spec_parts = model_name.split("/").collect::<Vec<&str>>();

Configure Feed

Configure Feed