Use Q6 Devstral Small 2 with 80k ctxlen · karashiiro.moe/nixos-config@a935c3a

+10 -12

1 changed file

expand all

+10 -12

configuration.nix

··· 97 97 # Devstral 2 - 24B coding agent model from Mistral AI 98 98 # (unsloth version has fixed system prompts!) 99 99 # Download: huggingface-cli download unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF \ 100 - # --include "*Q4_K_XL*" \ 101 - # --local-dir /srv/devstral-2 102 - # huggingface-cli download unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF \ 103 - # --include "*mmproj-F16*" \ 100 + # --include "*Q6_K_XL*" \ 104 101 # --local-dir /srv/devstral-2 105 102 { 106 103 Name = "devstral-2"; ··· 109 106 ProxyTargetPort = "18081"; 110 107 Command = "${llamaCppCuda}/bin/llama-server"; 111 108 Args = builtins.concatStringsSep " " [ 112 - "-m /srv/devstral-2/Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf" 113 - "--mmproj /srv/devstral-2/mmproj-F16.gguf" 109 + "-m /srv/devstral-2/Devstral-Small-2-24B-Instruct-2512-UD-Q6_K_XL.gguf" 114 110 "--jinja" # Enable Jinja2 templates for chat formatting 115 111 "-ngl 99" # Offload all layers to GPU 116 112 "--threads -1" # Auto-detect threads 117 - "--ctx-size 98304" # 96k context 113 + "--ctx-size 81920" # 80k context (Q6_K_XL needs more VRAM for weights) 118 114 "-fa on" # Flash attention 119 - "--cache-type-k q8_0" # Quantize K cache to save VRAM 120 - "--cache-type-v q8_0" # Quantize V cache to save VRAM 115 + "--cache-type-k q4_0" # Quantize K cache to save VRAM 116 + "--cache-type-v q4_0" # Quantize V cache to save VRAM 121 117 "--host 127.0.0.1" 122 118 "--port 18081" 123 119 ]; ··· 126 122 HealthcheckIntervalMilliseconds = 500; 127 123 RestartOnConnectionFailure = true; 128 124 ResourceRequirements = { 129 - VRAM = 23800; # ~23GB for Q4_K_XL quantization at a 96k context with image support 125 + VRAM = 23800; # ~23GB for Q6_K_XL quantization at 80k context 126 }; 127 } 128 # GLM 4.7 Flash - 30B MoE model (only ~3.6B active params!) 
··· 439 435 440 436 # Development tools 441 437 devcontainer 442 - python3Packages.huggingface-hub 443 - python3Packages.hf-xet # Blazing fast HuggingFace downloads via XET protocol 438 + (python3.withPackages (ps: with ps; [ 439 + huggingface-hub 440 + hf-xet # Blazing fast HuggingFace downloads via XET protocol 441 + ])) 444 442 llamaCppCuda 445 443 unstable.claude-code 446 444 ];

Configure Feed

Configure Feed