add check · ssp.sh/dotfiles@24cbbbc

+48 -5

1 changed file

expand all

hypr

.config

hypr

sspaeti

gpu-health-check

+48 -5

hypr/.config/hypr/sspaeti/gpu-health-check

···
6 6 YELLOW='\033[0;33m'
7 7 NC='\033[0m'
8 8
9 + warn=0
10 +
9 11 echo "GPU Health Check"
10 12 echo "================"
11 13
···
18 20     echo -e "${RED}[WARN]${NC} GPU errors detected:"
19 21     echo "$errors" | head -5
20 22     echo -e "${YELLOW}Recommendation: Reboot before your meeting${NC}"
23 +     warn=1
21 24 fi
22 25
23 - # Check GPU is responding
24 - if busy=$(cat /sys/class/drm/card*/device/gpu_busy_percent 2>/dev/null | head -1); then
26 + # Check MES errors specifically (early warning before full crash)
27 + mes_count=$(journalctl -b -k --no-pager 2>/dev/null | grep -c "MES")
28 + if [ "$mes_count" -gt 0 ]; then
29 +     echo -e "${RED}[WARN]${NC} MES errors this boot: ${mes_count} (crash imminent, reboot NOW)"
30 +     warn=1
31 + else
32 +     echo -e "${GREEN}[OK]${NC} MES scheduler healthy"
33 + fi
34 +
35 + # Check GPU is responding (timeout = early freeze detection)
36 + if busy=$(timeout 2 cat /sys/class/drm/card*/device/gpu_busy_percent 2>/dev/null | head -1); then
25 37     echo -e "${GREEN}[OK]${NC} GPU responding (${busy}% busy)"
26 38 else
27 -     echo -e "${RED}[FAIL]${NC} Cannot read GPU status"
39 +     echo -e "${RED}[FAIL]${NC} Cannot read GPU status (GPU may be hung)"
40 +     warn=1
28 41 fi
29 42
30 43 # Check VRAM
···
35 48     echo -e "${YELLOW}[SKIP]${NC} VRAM info not available"
36 49 fi
37 50
51 + # Check CWSR status (known MES crash trigger on gfx1150)
52 + cwsr=$(cat /sys/module/amdgpu/parameters/cwsr_enable 2>/dev/null)
53 + if [ "$cwsr" = "1" ]; then
54 +     echo -e "${YELLOW}[INFO]${NC} CWSR enabled (known MES crash risk on gfx1150)"
55 +     echo -e " To disable: add amdgpu.cwsr_enable=0 to kernel params"
56 + elif [ "$cwsr" = "0" ]; then
57 +     echo -e "${GREEN}[OK]${NC} CWSR disabled (MES crash mitigation active)"
58 + fi
59 +
60 + # Check if last suspend/resume left GPU degraded
61 + resume_errors=$(journalctl -b -k --no-pager 2>/dev/null | grep -E "(SMU is resumed|ring .* uses VM)" | tail -1)
62 + if [ -n "$resume_errors" ]; then
63 +     post_resume_errors=$(journalctl -b -k --no-pager 2>/dev/null | sed -n "/SMU is resumed/,\$p" | grep -cE "(fail|error|timeout)" 2>/dev/null)
64 +     if [ "$post_resume_errors" -gt 0 ]; then
65 +         echo -e "${RED}[WARN]${NC} GPU errors after last resume: ${post_resume_errors} (reboot recommended)"
66 +         warn=1
67 +     else
68 +         echo -e "${GREEN}[OK]${NC} Last suspend/resume was clean"
69 +     fi
70 + fi
71 +
72 + # Check kernel params are set
73 + cmdline=$(cat /proc/cmdline 2>/dev/null)
74 + missing=""
75 + echo "$cmdline" | grep -q "gpu_recovery=1" || missing="${missing} gpu_recovery"
76 + echo "$cmdline" | grep -q "ip_block_mask" || missing="${missing} ip_block_mask(VPE)"
77 + if [ -n "$missing" ]; then
78 +     echo -e "${YELLOW}[INFO]${NC} Missing kernel params:${missing}"
79 + fi
80 +
38 81 # Summary
39 82 echo ""
40 - if [ -z "$errors" ]; then
41 -     echo -e "${GREEN}GPU is healthy - safe to start meeting${NC}"
83 + if [ "$warn" -eq 0 ]; then
84 +     echo -e "${GREEN}GPU is healthy - safe to proceed${NC}"
42 85 else
43 86     echo -e "${RED}GPU may be unstable - consider rebooting${NC}"
44 87 fi

Configure Feed

Configure Feed