Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amd/amdgpu implement tdr advanced mode

[Why]
The previous tdr design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and
cause an unexpected gfx job timeout, because the gfx and compute rings
share internal GC HW with each other.

[How]
This patch implements an advanced tdr mode. It involves an additional
synchronous pre-resubmit step (Step0 Resubmit) before the normal resubmit
step in order to find the real bad job.

1. At the Step0 Resubmit stage, it synchronously resubmits the first pending
job and waits for it to be signaled. If the wait times out, we identify that
job as guilty and do a hw reset. After that, we do the normal resubmit step
to resubmit the remaining jobs.

2. For a whole gpu reset (vram lost), resubmit jobs the old way.

v2: squash in build fix (Alex)

Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Jack Zhang and committed by
Alex Deucher
e6c6338f 030bb4ad

+156 -33
+81
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4575 4575 return 0; 4576 4576 } 4577 4577 4578 + void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev, 4579 + struct amdgpu_hive_info *hive, 4580 + struct list_head *device_list_handle, 4581 + bool *need_full_reset) 4582 + { 4583 + int i, r = 0; 4584 + 4585 + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4586 + struct amdgpu_ring *ring = adev->rings[i]; 4587 + int ret = 0; 4588 + struct drm_sched_job *s_job; 4589 + 4590 + if (!ring || !ring->sched.thread) 4591 + continue; 4592 + 4593 + s_job = list_first_entry_or_null(&ring->sched.pending_list, 4594 + struct drm_sched_job, list); 4595 + if (s_job == NULL) 4596 + continue; 4597 + 4598 + /* clear job's guilty and depend the folowing step to decide the real one */ 4599 + drm_sched_reset_karma(s_job); 4600 + drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4601 + 4602 + ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4603 + if (ret == 0) { /* timeout */ 4604 + DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4605 + ring->sched.name, s_job->id); 4606 + 4607 + /* set guilty */ 4608 + drm_sched_increase_karma(s_job); 4609 + retry: 4610 + /* do hw reset */ 4611 + if (amdgpu_sriov_vf(adev)) { 4612 + amdgpu_virt_fini_data_exchange(adev); 4613 + r = amdgpu_device_reset_sriov(adev, false); 4614 + if (r) 4615 + adev->asic_reset_res = r; 4616 + } else { 4617 + r = amdgpu_do_asic_reset(hive, device_list_handle, 4618 + need_full_reset, false); 4619 + if (r && r == -EAGAIN) 4620 + goto retry; 4621 + } 4622 + 4623 + /* 4624 + * add reset counter so that the following 4625 + * resubmitted job could flush vmid 4626 + */ 4627 + atomic_inc(&adev->gpu_reset_counter); 4628 + continue; 4629 + } 4630 + 4631 + /* got the hw fence, signal finished fence */ 4632 + atomic_dec(ring->sched.score); 4633 + dma_fence_get(&s_job->s_fence->finished); 4634 + dma_fence_signal(&s_job->s_fence->finished); 4635 + dma_fence_put(&s_job->s_fence->finished); 4636 + 4637 + /* remove node from list and free 
the job */ 4638 + spin_lock(&ring->sched.job_list_lock); 4639 + list_del_init(&s_job->list); 4640 + spin_unlock(&ring->sched.job_list_lock); 4641 + ring->sched.ops->free_job(s_job); 4642 + } 4643 + } 4644 + 4578 4645 /** 4579 4646 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4580 4647 * ··· 4664 4597 int i, r = 0; 4665 4598 bool need_emergency_restart = false; 4666 4599 bool audio_suspended = false; 4600 + int tmp_vram_lost_counter; 4667 4601 4668 4602 /* 4669 4603 * Special case: RAS triggered and full reset isn't supported ··· 4816 4748 } 4817 4749 } 4818 4750 4751 + tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 4819 4752 /* Actual ASIC resets if needed.*/ 4820 4753 /* TODO Implement XGMI hive reset logic for SRIOV */ 4821 4754 if (amdgpu_sriov_vf(adev)) { ··· 4833 4764 4834 4765 /* Post ASIC reset for all devs .*/ 4835 4766 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4767 + 4768 + /* 4769 + * Sometimes a later bad compute job can block a good gfx job as gfx 4770 + * and compute ring share internal GC HW mutually. We add an additional 4771 + * guilty jobs recheck step to find the real guilty job, it synchronously 4772 + * submits and pends for the first job being signaled. If it gets timeout, 4773 + * we identify it as a real guilty job. 4774 + */ 4775 + if (amdgpu_gpu_recovery == 2 && 4776 + !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 4777 + amdgpu_device_recheck_guilty_jobs(tmp_adev, hive, 4778 + device_list_handle, &need_full_reset); 4836 4779 4837 4780 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4838 4781 struct amdgpu_ring *ring = tmp_adev->rings[i];
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
··· 516 516 * DOC: gpu_recovery (int) 517 517 * Set to enable GPU recovery mechanism (1 = enable, 0 = disable). The default is -1 (auto, disabled except SRIOV). 518 518 */ 519 - MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)"); 519 + MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (2 = advanced tdr mode, 1 = enable, 0 = disable, -1 = auto)"); 520 520 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444); 521 521 522 522 /**
+71 -32
drivers/gpu/drm/scheduler/sched_main.c
··· 361 361 */ 362 362 void drm_sched_increase_karma(struct drm_sched_job *bad) 363 363 { 364 - int i; 365 - struct drm_sched_entity *tmp; 366 - struct drm_sched_entity *entity; 367 - struct drm_gpu_scheduler *sched = bad->sched; 368 - 369 - /* don't increase @bad's karma if it's from KERNEL RQ, 370 - * because sometimes GPU hang would cause kernel jobs (like VM updating jobs) 371 - * corrupt but keep in mind that kernel jobs always considered good. 372 - */ 373 - if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { 374 - atomic_inc(&bad->karma); 375 - for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; 376 - i++) { 377 - struct drm_sched_rq *rq = &sched->sched_rq[i]; 378 - 379 - spin_lock(&rq->lock); 380 - list_for_each_entry_safe(entity, tmp, &rq->entities, list) { 381 - if (bad->s_fence->scheduled.context == 382 - entity->fence_context) { 383 - if (atomic_read(&bad->karma) > 384 - bad->sched->hang_limit) 385 - if (entity->guilty) 386 - atomic_set(entity->guilty, 1); 387 - break; 388 - } 389 - } 390 - spin_unlock(&rq->lock); 391 - if (&entity->list != &rq->entities) 392 - break; 393 - } 394 - } 364 + drm_sched_increase_karma_ext(bad, 1); 395 365 } 396 366 EXPORT_SYMBOL(drm_sched_increase_karma); 367 + 368 + void drm_sched_reset_karma(struct drm_sched_job *bad) 369 + { 370 + drm_sched_increase_karma_ext(bad, 0); 371 + } 372 + EXPORT_SYMBOL(drm_sched_reset_karma); 397 373 398 374 /** 399 375 * drm_sched_stop - stop the scheduler ··· 510 534 */ 511 535 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) 512 536 { 537 + drm_sched_resubmit_jobs_ext(sched, INT_MAX); 538 + } 539 + EXPORT_SYMBOL(drm_sched_resubmit_jobs); 540 + 541 + /** 542 + * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list 543 + * 544 + * @sched: scheduler instance 545 + * @max: job numbers to relaunch 546 + * 547 + */ 548 + void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max) 549 + { 513 550 struct drm_sched_job 
*s_job, *tmp; 514 551 uint64_t guilty_context; 515 552 bool found_guilty = false; 516 553 struct dma_fence *fence; 554 + int i = 0; 517 555 518 556 list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { 519 557 struct drm_sched_fence *s_fence = s_job->s_fence; 558 + 559 + if (i >= max) 560 + break; 520 561 521 562 if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { 522 563 found_guilty = true; ··· 545 552 546 553 dma_fence_put(s_job->s_fence->parent); 547 554 fence = sched->ops->run_job(s_job); 555 + i++; 548 556 549 557 if (IS_ERR_OR_NULL(fence)) { 550 558 if (IS_ERR(fence)) ··· 557 563 } 558 564 } 559 565 } 560 - EXPORT_SYMBOL(drm_sched_resubmit_jobs); 566 + EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext); 561 567 562 568 /** 563 569 * drm_sched_job_init - init a scheduler job ··· 897 903 sched->ready = false; 898 904 } 899 905 EXPORT_SYMBOL(drm_sched_fini); 906 + 907 + /** 908 + * drm_sched_increase_karma_ext - Update sched_entity guilty flag 909 + * 910 + * @bad: The job guilty of time out 911 + * @type: type for increase/reset karma 912 + * 913 + */ 914 + void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type) 915 + { 916 + int i; 917 + struct drm_sched_entity *tmp; 918 + struct drm_sched_entity *entity; 919 + struct drm_gpu_scheduler *sched = bad->sched; 920 + 921 + /* don't change @bad's karma if it's from KERNEL RQ, 922 + * because sometimes GPU hang would cause kernel jobs (like VM updating jobs) 923 + * corrupt but keep in mind that kernel jobs always considered good. 
924 + */ 925 + if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { 926 + if (type == 0) 927 + atomic_set(&bad->karma, 0); 928 + else if (type == 1) 929 + atomic_inc(&bad->karma); 930 + 931 + for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; 932 + i++) { 933 + struct drm_sched_rq *rq = &sched->sched_rq[i]; 934 + 935 + spin_lock(&rq->lock); 936 + list_for_each_entry_safe(entity, tmp, &rq->entities, list) { 937 + if (bad->s_fence->scheduled.context == 938 + entity->fence_context) { 939 + if (entity->guilty) 940 + atomic_set(entity->guilty, type); 941 + break; 942 + } 943 + } 944 + spin_unlock(&rq->lock); 945 + if (&entity->list != &rq->entities) 946 + break; 947 + } 948 + } 949 + } 950 + EXPORT_SYMBOL(drm_sched_increase_karma_ext);
+3
include/drm/gpu_scheduler.h
··· 322 322 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad); 323 323 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery); 324 324 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); 325 + void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max); 325 326 void drm_sched_increase_karma(struct drm_sched_job *bad); 327 + void drm_sched_reset_karma(struct drm_sched_job *bad); 328 + void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type); 326 329 bool drm_sched_dependency_optimized(struct dma_fence* fence, 327 330 struct drm_sched_entity *entity); 328 331 void drm_sched_fault(struct drm_gpu_scheduler *sched);