Add sad GLM 4.7 Flash with no FA · karashiiro.moe/nixos-config@2ffa101

+36

1 changed file

expand all

+36

configuration.nix

··· 129 129 VRAM = 23800; # ~23GB for Q4_K_XL quantization at a 96k context with image support 130 130 }; 131 131 } 132 + # GLM 4.7 Flash - 30B MoE model (only ~3.6B active params!) 133 + # Excellent for reasoning, coding, and agentic workflows 134 + # Download: huggingface-cli download unsloth/GLM-4.7-Flash-GGUF \ 135 + # --include "*IQ4_XS*" \ 136 + # --local-dir /srv/glm-4.7-flash 137 + { 138 + Name = "glm-4.7-flash"; 139 + ListenPort = "8082"; 140 + ProxyTargetHost = "127.0.0.1"; 141 + ProxyTargetPort = "18082"; 142 + Command = "${llamaCppCuda}/bin/llama-server"; 143 + Args = builtins.concatStringsSep " " [ 144 + "-m /srv/glm-4.7-flash/GLM-4.7-Flash-IQ4_XS.gguf" 145 + "--jinja" # Required for proper chat template 146 + "-ngl 99" # Offload all layers to GPU 147 + "--threads -1" # Auto-detect threads 148 + "--ctx-size 56000" # 56K context - IQ4_XS saves ~2.5GB for more ctx headroom! 149 + # NOTE: no KV cache quantization here - a quantized V cache requires Flash Attention, which is not available for this model on CUDA 150 + "--dry-multiplier 1.1" # CRITICAL: prevents repetition/looping! 
151 + "--temp 0.2" 152 + "--top-k 50" 153 + "--top-p 0.95" 154 + "--min-p 0.01" 155 + "--host 127.0.0.1" 156 + "--port 18082" 157 + ]; 158 + OpenAiApi = true; 159 + HealthcheckCommand = "curl --fail http://127.0.0.1:18082/health"; 160 + HealthcheckIntervalMilliseconds = 500; 161 + RestartOnConnectionFailure = true; 162 + ResourceRequirements = { 163 + VRAM = 24000; # ~24GB for IQ4_XS (~15GB) + 56K ctx WITHOUT KV cache quantization 164 + }; 165 + } 132 166 # Add more models here - LMP will swap them based on VRAM availability 133 167 ]; 134 168 }; ··· 406 440 # Development tools 407 441 devcontainer 408 442 python3Packages.huggingface-hub 443 + python3Packages.hf-xet # Blazing fast HuggingFace downloads via XET protocol 409 444 llamaCppCuda 410 445 unstable.claude-code 411 446 ]; ··· 494 529 7071 # Large Model Proxy - Management UI 495 530 8080 # LMP proxied model endpoint (gpt-oss-20b) 496 531 8081 # LMP proxied model endpoint (devstral-2) 532 + 8082 # LMP proxied model endpoint (glm-4.7-flash) 497 533 8888 # Jupyter notebook (if you use it) 498 534 ]; 499 535 # networking.firewall.allowedUDPPorts = [ ... ];

Configure Feed

Configure Feed