Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: disable ras query and inject during gpu reset

Add a flag to the RAS context that indicates whether the RAS error query functionality is ready.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

John Clements and committed by
Alex Deucher
61380faa 66399248

+28 -3
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4168 4168 need_full_reset = job_signaled = false; 4169 4169 INIT_LIST_HEAD(&device_list); 4170 4170 4171 + amdgpu_ras_set_error_query_ready(adev, false); 4172 + 4171 4173 dev_info(adev->dev, "GPU %s begin!\n", 4172 4174 (in_ras_intr && !use_baco) ? "jobs stop":"reset"); 4173 4175 ··· 4226 4224 /* block all schedulers and reset given job's ring */ 4227 4225 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 4228 4226 if (tmp_adev != adev) { 4227 + amdgpu_ras_set_error_query_ready(tmp_adev, false); 4229 4228 amdgpu_device_lock_adev(tmp_adev, false); 4230 4229 if (!amdgpu_sriov_vf(tmp_adev)) 4231 4230 amdgpu_amdkfd_pre_reset(tmp_adev);
+21 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 80 80 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, 81 81 uint64_t addr); 82 82 83 + void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) 84 + { 85 + if (adev) 86 + amdgpu_ras_get_context(adev)->error_query_ready = ready; 87 + } 88 + 89 + bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) 90 + { 91 + if (adev) 92 + return amdgpu_ras_get_context(adev)->error_query_ready; 93 + 94 + return false; 95 + } 96 + 83 97 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, 84 98 size_t size, loff_t *pos) 85 99 { ··· 295 281 struct ras_debug_if data; 296 282 int ret = 0; 297 283 298 - if (amdgpu_ras_intr_triggered()) { 284 + if (!amdgpu_ras_get_error_query_ready(adev)) { 299 285 DRM_WARN("RAS WARN: error injection currently inaccessible\n"); 300 286 return size; 301 287 } ··· 413 399 .head = obj->head, 414 400 }; 415 401 416 - if (amdgpu_ras_intr_triggered()) 402 + if (!amdgpu_ras_get_error_query_ready(obj->adev)) 417 403 return snprintf(buf, PAGE_SIZE, 418 404 "Query currently inaccessible\n"); 419 405 ··· 1900 1886 } 1901 1887 1902 1888 /* in resume phase, no need to create ras fs node */ 1903 - if (adev->in_suspend || adev->in_gpu_reset) 1889 + if (adev->in_suspend || adev->in_gpu_reset) { 1890 + amdgpu_ras_set_error_query_ready(adev, true); 1904 1891 return 0; 1892 + } 1905 1893 1906 1894 if (ih_info->cb) { 1907 1895 r = amdgpu_ras_interrupt_add_handler(adev, ih_info); ··· 1914 1898 r = amdgpu_ras_sysfs_create(adev, fs_info); 1915 1899 if (r) 1916 1900 goto sysfs; 1901 + 1902 + amdgpu_ras_set_error_query_ready(adev, true); 1917 1903 1918 1904 return 0; 1919 1905 cleanup:
+4
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 334 334 uint32_t flags; 335 335 bool reboot; 336 336 struct amdgpu_ras_eeprom_control eeprom_control; 337 + 338 + bool error_query_ready; 337 339 }; 338 340 339 341 struct ras_fs_data { ··· 630 628 } 631 629 632 630 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); 631 + 632 + void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); 633 633 634 634 #endif