Bump Qwen3.6 ctx to 60k · karashiiro.moe/nixos-config@b17b3f9

+39 -2

1 changed file

expand all

+39 -2

configuration.nix

··· 176 176 "--jinja" # Required for proper chat template 177 177 "-ngl 99" # Offload all layers to GPU 178 178 "--threads -1" # Auto-detect threads 179 - "--ctx-size 49152" # 48k context 179 + "--ctx-size 60000" # 60k context 180 180 "-fa on" # Flash attention 181 181 "--cache-type-k bf16" # bf16 required - q4/q8 KV cache causes gibberish! 182 182 "--cache-type-v bf16" # bf16 required - per unsloth docs ··· 184 184 "--top-p 0.8" 185 185 "--top-k 20" 186 186 "--presence-penalty 1.5" # Prevents repetition (Qwen3.6 recommended) 187 - "--chat-template-kwargs '{\"enable_thinking\":true}'" 187 + "--chat-template-kwargs '{\"enable_thinking\":true,\"preserve_thinking\":true}'" 188 188 "--host 127.0.0.1" 189 189 "--port 18083" 190 190 ]; ··· 194 194 RestartOnConnectionFailure = true; 195 195 ResourceRequirements = { 196 196 VRAM = 20000; # ~20GB for Q3_K_XL + bf16 KV cache (estimated at 48k ctx; TODO: re-check for the 60k ctx set above) 197 + }; 198 + } 199 + # Qwen3.6-27B - Dense model (27B params, all active) 200 + # Slower than 35B-A3B MoE but potentially higher quality per param. 201 + # Supports vision/multimodal (would need separate mmproj file). 202 + # Download: huggingface-cli download unsloth/Qwen3.6-27B-GGUF \ 203 + # --include "*Q4_K_XL*" \ 204 + # --local-dir /srv/qwen3.6-27b 205 + { 206 + Name = "qwen3.6-27b"; 207 + ListenPort = "8084"; 208 + ProxyTargetHost = "127.0.0.1"; 209 + ProxyTargetPort = "18084"; 210 + Command = "${llamaCppCuda}/bin/llama-server"; 211 + Args = builtins.concatStringsSep " " [ 212 + "-m /srv/qwen3.6-27b/Qwen3.6-27B-UD-Q4_K_XL.gguf" 213 + "--jinja" # Required for proper chat template 214 + "-ngl 99" # Offload all layers to GPU 215 + "--threads -1" # Auto-detect threads 216 + "--ctx-size 60000" # 60k context 217 + "-fa on" # Flash attention 218 + "--cache-type-k bf16" # bf16 required - q4/q8 KV cache causes gibberish! 
219 + "--cache-type-v bf16" # bf16 required - per unsloth docs 220 + "--temp 0.7" # Recommended for non-thinking general tasks (NOTE(review): enable_thinking is true below - confirm sampler settings) 221 + "--top-p 0.8" 222 + "--top-k 20" 223 + "--presence-penalty 1.5" # Prevents repetition (Qwen3.6 recommended) 224 + "--chat-template-kwargs '{\"enable_thinking\":true,\"preserve_thinking\":true}'" 225 + "--host 127.0.0.1" 226 + "--port 18084" 227 + ]; 228 + OpenAiApi = true; 229 + HealthcheckCommand = "curl --fail http://127.0.0.1:18084/health"; 230 + HealthcheckIntervalMilliseconds = 500; 231 + RestartOnConnectionFailure = true; 232 + ResourceRequirements = { 233 + VRAM = 22000; # ~22GB for Q4_K_XL (~17.6GB weights) + bf16 KV (estimated at 32k ctx; TODO: re-check for the 60k --ctx-size) 197 234 }; 198 235 } 199 236 # Add more models here - LMP will swap them based on VRAM availability

Configure Feed

Configure Feed