Add Qwen 3.6 to inference setup · karashiiro.moe/nixos-config@5d4a98c

karashiiro.moe / nixos-config

fork

Personal NixOS config

fork

+36

1 changed file

expand all

configuration.nix

+36

configuration.nix

···
160 160 VRAM = 24000; # ~24GB for Q4_K_XL + 56K ctx (no FA/KV quant - MoE FA is too slow)
161 161 };
162 162 }
163 + # Qwen3.6 35B-A3B - MoE model (35B total, ~3B active params)
164 + # Great general-purpose model with strong multilingual and reasoning
165 + # Download: huggingface-cli download unsloth/Qwen3.6-35B-A3B-GGUF \
166 + # --include "*Q3_K_XL*" \
167 + # --local-dir /srv/qwen3.6
168 + {
169 + Name = "qwen3.6";
170 + ListenPort = "8083";
171 + ProxyTargetHost = "127.0.0.1";
172 + ProxyTargetPort = "18083";
173 + Command = "${llamaCppCuda}/bin/llama-server";
174 + Args = builtins.concatStringsSep " " [
175 + "-m /srv/qwen3.6/Qwen3.6-35B-A3B-UD-Q3_K_XL.gguf"
176 + "--jinja" # Required for proper chat template
177 + "-ngl 99" # Offload all layers to GPU
178 + "--threads -1" # Auto-detect threads
179 + "--ctx-size 49152" # 48k context
180 + "-fa on" # Flash attention
181 + "--cache-type-k bf16" # bf16 required - q4/q8 KV cache causes gibberish!
182 + "--cache-type-v bf16" # bf16 required - per unsloth docs
183 + "--temp 0.7" # Recommended for non-thinking general tasks
184 + "--top-p 0.8"
185 + "--top-k 20"
186 + "--presence-penalty 1.5" # Prevents repetition (Qwen3.6 recommended)
187 + "--chat-template-kwargs '{\"enable_thinking\":true}'"
188 + "--host 127.0.0.1"
189 + "--port 18083"
190 + ];
191 + OpenAiApi = true;
192 + HealthcheckCommand = "curl --fail http://127.0.0.1:18083/health";
193 + HealthcheckIntervalMilliseconds = 500;
194 + RestartOnConnectionFailure = true;
195 + ResourceRequirements = {
196 + VRAM = 20000; # ~20GB for Q3_K_XL + 48k ctx with bf16 KV cache
197 + };
198 + }
163 199 # Add more models here - LMP will swap them based on VRAM availability
164 200 ];
165 201 };

Configure Feed

Configure Feed