Use Q4 quantization for GLM 4.7 Flash at 56k ctxlen · karashiiro.moe/nixos-config@2dae2e2

karashiiro.moe / nixos-config

fork

Personal NixOS config

fork

+6 -5

1 changed file

expand all

configuration.nix

+6 -5

configuration.nix

··· 128 128 # GLM 4.7 Flash - 30B MoE model (only ~3.6B active params!) 129 129 # Excellent for reasoning, coding, and agentic workflows 130 130 # Download: huggingface-cli download unsloth/GLM-4.7-Flash-GGUF \ 131 - # --include "*IQ4_XS*" \ 131 + # --include "*Q4_K_XL*" \ 132 132 # --local-dir /srv/glm-4.7-flash 133 133 { 134 134 Name = "glm-4.7-flash"; ··· 137 137 ProxyTargetPort = "18082"; 138 138 Command = "${llamaCppCuda}/bin/llama-server"; 139 139 Args = builtins.concatStringsSep " " [ 140 - "-m /srv/glm-4.7-flash/GLM-4.7-Flash-IQ4_XS.gguf" 140 + "-m /srv/glm-4.7-flash/GLM-4.7-Flash-UD-Q4_K_XL.gguf" 141 141 "--jinja" # Required for proper chat template 142 142 "-ngl 99" # Offload all layers to GPU 143 143 "--threads -1" # Auto-detect threads 144 - "--ctx-size 56000" # 56K context - IQ4_XS saves ~2.5GB for more ctx headroom! 145 - # NOTE: KV cache quantization requires Flash Attention, which MoE doesn't support on CUDA 144 + "--ctx-size 56000" # 56K context 145 + # NOTE: Flash attention for MoE on CUDA is functional but very slow (~9 t/s) 146 + "--cache-type-k q8_0" # Quantize K cache (V cache quant still requires FA) 146 147 "--dry-multiplier 1.1" # CRITICAL: prevents repetition/looping! 147 148 "--temp 0.2" 148 149 "--top-k 50" ··· 156 157 HealthcheckIntervalMilliseconds = 500; 157 158 RestartOnConnectionFailure = true; 158 159 ResourceRequirements = { 159 - VRAM = 24000; # ~24GB for IQ4_XS (~15GB) + 56K ctx WITHOUT KV cache quantization 160 + VRAM = 24000; # ~24GB for Q4_K_XL + 56K ctx (q8_0 K cache only; no FA, so no V cache quant - MoE FA is too slow) 160 161 }; 161 162 } 162 163 # Add more models here - LMP will swap them based on VRAM availability

Configure Feed

Configure Feed