Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdkfd: APIs to stop/start KFD scheduling

Provide amdgpu_amdkfd_stop_sched() so that amdgpu can stop KFD from scheduling
compute work on HIQ; amdgpu_amdkfd_start_sched() resumes the scheduling.
When amdgpu_amdkfd_stop_sched() is called, KFD unmaps queues from the
runlist. If users send ioctls to KFD to create queues in the meantime, the
queues are added but are not mapped to the runlist (and so not scheduled) until
amdgpu_amdkfd_start_sched() is called.

v2: fix build (Alex)

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Amber Lin and committed by
Alex Deucher
234eebe1 b1f49ff9

+137 -1
+18
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
··· 887 887 888 888 return r; 889 889 } 890 + 891 + /* Stop scheduling on KFD */ 892 + int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id) 893 + { 894 + if (!adev->kfd.init_complete) 895 + return 0; 896 + 897 + return kgd2kfd_stop_sched(adev->kfd.dev, node_id); 898 + } 899 + 900 + /* Start scheduling on KFD */ 901 + int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id) 902 + { 903 + if (!adev->kfd.init_complete) 904 + return 0; 905 + 906 + return kgd2kfd_start_sched(adev->kfd.dev, node_id); 907 + }
+14
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
··· 264 264 uint32_t *payload); 265 265 int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, 266 266 u32 inst); 267 + int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id); 268 + int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id); 267 269 268 270 /* Read user wptr from a specified user address space with page fault 269 271 * disabled. The memory must be pinned and mapped to the hardware when ··· 428 426 void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask); 429 427 int kgd2kfd_check_and_lock_kfd(void); 430 428 void kgd2kfd_unlock_kfd(void); 429 + int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); 430 + int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); 431 431 #else 432 432 static inline int kgd2kfd_init(void) 433 433 { ··· 499 495 500 496 static inline void kgd2kfd_unlock_kfd(void) 501 497 { 498 + } 499 + 500 + static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) 501 + { 502 + return 0; 503 + } 504 + 505 + static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) 506 + { 507 + return 0; 502 508 } 503 509 #endif 504 510 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
+39
drivers/gpu/drm/amd/amdkfd/kfd_device.c
··· 1446 1446 mutex_unlock(&kfd_processes_mutex); 1447 1447 } 1448 1448 1449 + int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) 1450 + { 1451 + struct kfd_node *node; 1452 + int ret; 1453 + 1454 + if (!kfd->init_complete) 1455 + return 0; 1456 + 1457 + if (node_id >= kfd->num_nodes) { 1458 + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", 1459 + node_id, kfd->num_nodes - 1); 1460 + return -EINVAL; 1461 + } 1462 + node = kfd->nodes[node_id]; 1463 + 1464 + ret = node->dqm->ops.unhalt(node->dqm); 1465 + if (ret) 1466 + dev_err(kfd_device, "Error in starting scheduler\n"); 1467 + 1468 + return ret; 1469 + } 1470 + 1471 + int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) 1472 + { 1473 + struct kfd_node *node; 1474 + 1475 + if (!kfd->init_complete) 1476 + return 0; 1477 + 1478 + if (node_id >= kfd->num_nodes) { 1479 + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", 1480 + node_id, kfd->num_nodes - 1); 1481 + return -EINVAL; 1482 + } 1483 + 1484 + node = kfd->nodes[node_id]; 1485 + return node->dqm->ops.halt(node->dqm); 1486 + } 1487 + 1449 1488 #if defined(CONFIG_DEBUG_FS) 1450 1489 1451 1490 /* This function will send a package to HIQ to hang the HWS
+57 -1
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 1679 1679 return 0; 1680 1680 } 1681 1681 1682 + /* halt_cpsch: 1683 + * Unmap queues so the schedule doesn't continue remaining jobs in the queue. 1684 + * Then set dqm->sched_halt so queues don't map to runlist until unhalt_cpsch 1685 + * is called. 1686 + */ 1687 + static int halt_cpsch(struct device_queue_manager *dqm) 1688 + { 1689 + int ret = 0; 1690 + 1691 + dqm_lock(dqm); 1692 + if (!dqm->sched_running) { 1693 + dqm_unlock(dqm); 1694 + return 0; 1695 + } 1696 + 1697 + WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n"); 1698 + 1699 + if (!dqm->is_hws_hang) { 1700 + if (!dqm->dev->kfd->shared_resources.enable_mes) 1701 + ret = unmap_queues_cpsch(dqm, 1702 + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 1703 + USE_DEFAULT_GRACE_PERIOD, false); 1704 + else 1705 + ret = remove_all_queues_mes(dqm); 1706 + } 1707 + dqm->sched_halt = true; 1708 + dqm_unlock(dqm); 1709 + 1710 + return ret; 1711 + } 1712 + 1713 + /* unhalt_cpsch 1714 + * Unset dqm->sched_halt and map queues back to runlist 1715 + */ 1716 + static int unhalt_cpsch(struct device_queue_manager *dqm) 1717 + { 1718 + int ret = 0; 1719 + 1720 + dqm_lock(dqm); 1721 + if (!dqm->sched_running || !dqm->sched_halt) { 1722 + WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n"); 1723 + dqm_unlock(dqm); 1724 + return 0; 1725 + } 1726 + dqm->sched_halt = false; 1727 + if (!dqm->dev->kfd->shared_resources.enable_mes) 1728 + ret = execute_queues_cpsch(dqm, 1729 + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 1730 + 0, USE_DEFAULT_GRACE_PERIOD); 1731 + dqm_unlock(dqm); 1732 + 1733 + return ret; 1734 + } 1735 + 1682 1736 static int start_cpsch(struct device_queue_manager *dqm) 1683 1737 { 1684 1738 struct device *dev = dqm->dev->adev->dev; ··· 2038 1984 struct device *dev = dqm->dev->adev->dev; 2039 1985 int retval; 2040 1986 2041 - if (!dqm->sched_running) 1987 + if (!dqm->sched_running || dqm->sched_halt) 2042 1988 return 0; 2043 1989 if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0) 2044 1990 
return 0; ··· 2781 2727 dqm->ops.initialize = initialize_cpsch; 2782 2728 dqm->ops.start = start_cpsch; 2783 2729 dqm->ops.stop = stop_cpsch; 2730 + dqm->ops.halt = halt_cpsch; 2731 + dqm->ops.unhalt = unhalt_cpsch; 2784 2732 dqm->ops.destroy_queue = destroy_queue_cpsch; 2785 2733 dqm->ops.update_queue = update_queue; 2786 2734 dqm->ops.register_process = register_process;
+9
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
··· 106 106 * @uninitialize: Destroys all the device queue manager resources allocated in 107 107 * initialize routine. 108 108 * 109 + * @halt: This routine unmaps queues from runlist and sets halt status to true 110 + * so no more queues will be mapped to runlist until unhalt. 111 + * 112 + * @unhalt: This routine resets halt status to false and maps queues back to 113 + * runlist. 114 + * 109 115 * @create_kernel_queue: Creates kernel queue. Used for debug queue. 110 116 * 111 117 * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue. ··· 159 153 int (*start)(struct device_queue_manager *dqm); 160 154 int (*stop)(struct device_queue_manager *dqm); 161 155 void (*uninitialize)(struct device_queue_manager *dqm); 156 + int (*halt)(struct device_queue_manager *dqm); 157 + int (*unhalt)(struct device_queue_manager *dqm); 162 158 int (*create_kernel_queue)(struct device_queue_manager *dqm, 163 159 struct kernel_queue *kq, 164 160 struct qcm_process_device *qpd); ··· 272 264 struct work_struct hw_exception_work; 273 265 struct kfd_mem_obj hiq_sdma_mqd; 274 266 bool sched_running; 267 + bool sched_halt; 275 268 276 269 /* used for GFX 9.4.3 only */ 277 270 uint32_t current_logical_xcc_start;