Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'drm-fixes-2025-05-17' of https://gitlab.freedesktop.org/drm/kernel

Pull drm fixes from Dave Airlie:
"Weekly drm fixes, I'll be honest and say I think this is larger than
I'd prefer at this point; the main blow-out is that xe has two
larger fixes.

One is a fix for active context utilisation reporting, it's for a
reported regression and will end up in stable anyways, so I don't see
any point in holding it up.

The second is a fix for mixed cpu/gpu atomics, which are currently
broken, but are also not something your average desktop/laptop user is
going to hit in normal operation, and having them fixed now is better
than threading them through stable later.

Other than those, it's mostly the usual, a bunch of amdgpu randoms and
a few other minor fixes.

dma-buf:
- Avoid memory reordering in fence handling

meson:
- Avoid integer overflow in mode-clock calculations

panel-mipi-dbi:
- Fix output with drm_client_setup_with_fourcc()

amdgpu:
- Fix CSA unmap
- Fix MALL size reporting on GFX11.5
- AUX fix
- DCN 3.5 fix
- VRR fix
- DP MST fix
- DML 2.1 fixes
- Silence DP AUX spam
- DCN 4.0.1 cursor fix
- VCN 4.0.5 fix

ivpu:
- Fix buffer size in debugfs code

gpusvm:
- Add timeslicing and allocation restriction for SVM

xe:
- Fix shrinker debugfs name
- Add HW workaround to Xe2
- Fix SVM when mixing GPU and CPU atomics
- Fix per-client engine utilization, which was misreported because active
contexts do not save their timestamp when lite restore is enabled"

* tag 'drm-fixes-2025-05-17' of https://gitlab.freedesktop.org/drm/kernel: (24 commits)
drm/xe: Add WA BB to capture active context utilization
drm/xe: Save the gt pointer in lrc and drop the tile
drm/xe: Save CTX_TIMESTAMP mmio value instead of LRC value
drm/xe: Timeslice GPU on atomic SVM fault
drm/gpusvm: Add timeslicing support to GPU SVM
drm/xe: Strict migration policy for atomic SVM faults
drm/gpusvm: Introduce devmem_only flag for allocation
drm/xe/xe2hpg: Add Wa_22021007897
drm/amdgpu: read back register after written for VCN v4.0.5
Revert "drm/amd/display: Hardware cursor changes color when switched to software cursor"
dma-buf: insert memory barrier before updating num_fences
drm/xe: Fix the gem shrinker name
drm/amd/display: Avoid flooding unnecessary info messages
drm/amd/display: Fix null check of pipe_ctx->plane_state for update_dchubp_dpp
drm/amd/display: check stream id dml21 wrapper to get plane_id
drm/amd/display: fix link_set_dpms_off multi-display MST corner case
drm/amd/display: Defer BW-optimization-blocked DRR adjustments
Revert: "drm/amd/display: Enable urgent latency adjustment on DCN35"
drm/amd/display: Correct the reply value when AUX write incomplete
drm/amdgpu: fix incorrect MALL size for GFX1151
...

+474 -121
+1 -1
drivers/accel/ivpu/ivpu_debugfs.c
··· 455 455 if (ret < 0) 456 456 return ret; 457 457 458 - buf[size] = '\0'; 458 + buf[ret] = '\0'; 459 459 ret = sscanf(buf, "%u %u %u %u", &band, &grace_period, &process_grace_period, 460 460 &process_quantum); 461 461 if (ret != 4)
+3 -2
drivers/dma-buf/dma-resv.c
··· 320 320 count++; 321 321 322 322 dma_resv_list_set(fobj, i, fence, usage); 323 - /* pointer update must be visible before we extend the num_fences */ 324 - smp_store_mb(fobj->num_fences, count); 323 + /* fence update must be visible before we extend the num_fences */ 324 + smp_wmb(); 325 + fobj->num_fences = count; 325 326 } 326 327 EXPORT_SYMBOL(dma_resv_add_fence); 327 328
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
··· 109 109 struct drm_exec exec; 110 110 int r; 111 111 112 - drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); 112 + drm_exec_init(&exec, 0, 0); 113 113 drm_exec_until_all_locked(&exec) { 114 114 r = amdgpu_vm_lock_pd(vm, &exec, 0); 115 115 if (likely(!r))
+12
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
··· 752 752 adev->gmc.vram_type = vram_type; 753 753 adev->gmc.vram_vendor = vram_vendor; 754 754 755 + /* The mall_size is already calculated as mall_size_per_umc * num_umc. 756 + * However, for gfx1151, which features a 2-to-1 UMC mapping, 757 + * the result must be multiplied by 2 to determine the actual mall size. 758 + */ 759 + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { 760 + case IP_VERSION(11, 5, 1): 761 + adev->gmc.mall_size *= 2; 762 + break; 763 + default: 764 + break; 765 + } 766 + 755 767 switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { 756 768 case IP_VERSION(11, 0, 0): 757 769 case IP_VERSION(11, 0, 1):
+8
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
··· 1023 1023 ring->doorbell_index << VCN_RB1_DB_CTRL__OFFSET__SHIFT | 1024 1024 VCN_RB1_DB_CTRL__EN_MASK); 1025 1025 1026 + /* Keeping one read-back to ensure all register writes are done, otherwise 1027 + * it may introduce race conditions */ 1028 + RREG32_SOC15(VCN, inst_idx, regVCN_RB1_DB_CTRL); 1029 + 1026 1030 return 0; 1027 1031 } 1028 1032 ··· 1208 1204 tmp |= VCN_RB_ENABLE__RB1_EN_MASK; 1209 1205 WREG32_SOC15(VCN, i, regVCN_RB_ENABLE, tmp); 1210 1206 fw_shared->sq.queue_mode &= ~(FW_QUEUE_RING_RESET | FW_QUEUE_DPG_HOLD_OFF); 1207 + 1208 + /* Keeping one read-back to ensure all register writes are done, otherwise 1209 + * it may introduce race conditions */ 1210 + RREG32_SOC15(VCN, i, regVCN_RB_ENABLE); 1211 1211 1212 1212 return 0; 1213 1213 }
+4 -1
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
··· 372 372 static inline bool is_dc_timing_adjust_needed(struct dm_crtc_state *old_state, 373 373 struct dm_crtc_state *new_state) 374 374 { 375 + if (new_state->stream->adjust.timing_adjust_pending) 376 + return true; 375 377 if (new_state->freesync_config.state == VRR_STATE_ACTIVE_FIXED) 376 378 return true; 377 379 else if (amdgpu_dm_crtc_vrr_active(old_state) != amdgpu_dm_crtc_vrr_active(new_state)) ··· 12765 12763 /* The reply is stored in the top nibble of the command. */ 12766 12764 payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; 12767 12765 12768 - if (!payload->write && p_notify->aux_reply.length) 12766 + /*write req may receive a byte indicating partially written number as well*/ 12767 + if (p_notify->aux_reply.length) 12769 12768 memcpy(payload->data, p_notify->aux_reply.data, 12770 12769 p_notify->aux_reply.length); 12771 12770
+11 -5
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
··· 62 62 enum aux_return_code_type operation_result; 63 63 struct amdgpu_device *adev; 64 64 struct ddc_service *ddc; 65 + uint8_t copy[16]; 65 66 66 67 if (WARN_ON(msg->size > 16)) 67 68 return -E2BIG; ··· 77 76 payload.write_status_update = 78 77 (msg->request & DP_AUX_I2C_WRITE_STATUS_UPDATE) != 0; 79 78 payload.defer_delay = 0; 79 + 80 + if (payload.write) { 81 + memcpy(copy, msg->buffer, msg->size); 82 + payload.data = copy; 83 + } 80 84 81 85 result = dc_link_aux_transfer_raw(TO_DM_AUX(aux)->ddc_service, &payload, 82 86 &operation_result); ··· 106 100 */ 107 101 if (payload.write && result >= 0) { 108 102 if (result) { 109 - /*one byte indicating partially written bytes. Force 0 to retry*/ 110 - drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n"); 111 - result = 0; 103 + /*one byte indicating partially written bytes*/ 104 + drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX partially written\n"); 105 + result = payload.data[0]; 112 106 } else if (!payload.reply[0]) 113 107 /*I2C_ACK|AUX_ACK*/ 114 108 result = msg->size; ··· 133 127 break; 134 128 } 135 129 136 - drm_info(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); 130 + drm_dbg_dp(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); 137 131 } 138 132 139 133 if (payload.reply[0]) 140 - drm_info(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", 134 + drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", 141 135 payload.reply[0]); 142 136 143 137 return result;
+7 -3
drivers/gpu/drm/amd/display/dc/core/dc.c
··· 439 439 * Don't adjust DRR while there's bandwidth optimizations pending to 440 440 * avoid conflicting with firmware updates. 441 441 */ 442 - if (dc->ctx->dce_version > DCE_VERSION_MAX) 443 - if (dc->optimized_required || dc->wm_optimized_required) 442 + if (dc->ctx->dce_version > DCE_VERSION_MAX) { 443 + if (dc->optimized_required || dc->wm_optimized_required) { 444 + stream->adjust.timing_adjust_pending = true; 444 445 return false; 446 + } 447 + } 445 448 446 449 dc_exit_ips_for_hw_access(dc); 447 450 ··· 3171 3168 3172 3169 if (update->crtc_timing_adjust) { 3173 3170 if (stream->adjust.v_total_min != update->crtc_timing_adjust->v_total_min || 3174 - stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max) 3171 + stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max || 3172 + stream->adjust.timing_adjust_pending) 3175 3173 update->crtc_timing_adjust->timing_adjust_pending = true; 3176 3174 stream->adjust = *update->crtc_timing_adjust; 3177 3175 update->crtc_timing_adjust->timing_adjust_pending = false;
+2 -2
drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c
··· 195 195 .dcn_downspread_percent = 0.5, 196 196 .gpuvm_min_page_size_bytes = 4096, 197 197 .hostvm_min_page_size_bytes = 4096, 198 - .do_urgent_latency_adjustment = 1, 198 + .do_urgent_latency_adjustment = 0, 199 199 .urgent_latency_adjustment_fabric_clock_component_us = 0, 200 - .urgent_latency_adjustment_fabric_clock_reference_mhz = 3000, 200 + .urgent_latency_adjustment_fabric_clock_reference_mhz = 0, 201 201 }; 202 202 203 203 void dcn35_build_wm_range_table_fpu(struct clk_mgr *clk_mgr)
+11 -9
drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
··· 910 910 } 911 911 912 912 //TODO : Could be possibly moved to a common helper layer. 913 - static bool dml21_wrapper_get_plane_id(const struct dc_state *context, const struct dc_plane_state *plane, unsigned int *plane_id) 913 + static bool dml21_wrapper_get_plane_id(const struct dc_state *context, unsigned int stream_id, const struct dc_plane_state *plane, unsigned int *plane_id) 914 914 { 915 915 int i, j; 916 916 ··· 918 918 return false; 919 919 920 920 for (i = 0; i < context->stream_count; i++) { 921 - for (j = 0; j < context->stream_status[i].plane_count; j++) { 922 - if (context->stream_status[i].plane_states[j] == plane) { 923 - *plane_id = (i << 16) | j; 924 - return true; 921 + if (context->streams[i]->stream_id == stream_id) { 922 + for (j = 0; j < context->stream_status[i].plane_count; j++) { 923 + if (context->stream_status[i].plane_states[j] == plane) { 924 + *plane_id = (i << 16) | j; 925 + return true; 926 + } 925 927 } 926 928 } 927 929 } ··· 946 944 return location; 947 945 } 948 946 949 - static unsigned int map_plane_to_dml21_display_cfg(const struct dml2_context *dml_ctx, 947 + static unsigned int map_plane_to_dml21_display_cfg(const struct dml2_context *dml_ctx, unsigned int stream_id, 950 948 const struct dc_plane_state *plane, const struct dc_state *context) 951 949 { 952 950 unsigned int plane_id; 953 951 int i = 0; 954 952 int location = -1; 955 953 956 - if (!dml21_wrapper_get_plane_id(context, plane, &plane_id)) { 954 + if (!dml21_wrapper_get_plane_id(context, stream_id, plane, &plane_id)) { 957 955 ASSERT(false); 958 956 return -1; 959 957 } ··· 1039 1037 dml_dispcfg->plane_descriptors[disp_cfg_plane_location].stream_index = disp_cfg_stream_location; 1040 1038 } else { 1041 1039 for (plane_index = 0; plane_index < context->stream_status[stream_index].plane_count; plane_index++) { 1042 - disp_cfg_plane_location = map_plane_to_dml21_display_cfg(dml_ctx, context->stream_status[stream_index].plane_states[plane_index], context); 1040 + 
disp_cfg_plane_location = map_plane_to_dml21_display_cfg(dml_ctx, context->streams[stream_index]->stream_id, context->stream_status[stream_index].plane_states[plane_index], context); 1043 1041 1044 1042 if (disp_cfg_plane_location < 0) 1045 1043 disp_cfg_plane_location = dml_dispcfg->num_planes++; ··· 1050 1048 populate_dml21_plane_config_from_plane_state(dml_ctx, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location], context->stream_status[stream_index].plane_states[plane_index], context, stream_index); 1051 1049 dml_dispcfg->plane_descriptors[disp_cfg_plane_location].stream_index = disp_cfg_stream_location; 1052 1050 1053 - if (dml21_wrapper_get_plane_id(context, context->stream_status[stream_index].plane_states[plane_index], &dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[disp_cfg_plane_location])) 1051 + if (dml21_wrapper_get_plane_id(context, context->streams[stream_index]->stream_id, context->stream_status[stream_index].plane_states[plane_index], &dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[disp_cfg_plane_location])) 1054 1052 dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id_valid[disp_cfg_plane_location] = true; 1055 1053 1056 1054 /* apply forced pstate policy */
+3 -2
drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c
··· 120 120 enum dc_cursor_color_format color_format = cursor_attributes->color_format; 121 121 int cur_rom_en = 0; 122 122 123 - // DCN4 should always do Cursor degamma for Cursor Color modes 124 123 if (color_format == CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA || 125 124 color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA) { 126 - cur_rom_en = 1; 125 + if (cursor_attributes->attribute_flags.bits.ENABLE_CURSOR_DEGAMMA) { 126 + cur_rom_en = 1; 127 + } 127 128 } 128 129 129 130 REG_UPDATE_3(CURSOR0_CONTROL,
+3 -3
drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
··· 1980 1980 dc->res_pool->hubbub, pipe_ctx->plane_res.hubp->inst, pipe_ctx->hubp_regs.det_size); 1981 1981 } 1982 1982 1983 - if (pipe_ctx->update_flags.raw || 1984 - (pipe_ctx->plane_state && pipe_ctx->plane_state->update_flags.raw) || 1985 - pipe_ctx->stream->update_flags.raw) 1983 + if (pipe_ctx->plane_state && (pipe_ctx->update_flags.raw || 1984 + pipe_ctx->plane_state->update_flags.raw || 1985 + pipe_ctx->stream->update_flags.raw)) 1986 1986 dc->hwss.update_dchubp_dpp(dc, pipe_ctx, context); 1987 1987 1988 1988 if (pipe_ctx->plane_state && (pipe_ctx->update_flags.bits.enable ||
+32 -5
drivers/gpu/drm/drm_gpusvm.c
··· 1118 1118 lockdep_assert_held(&gpusvm->notifier_lock); 1119 1119 1120 1120 if (range->flags.has_dma_mapping) { 1121 + struct drm_gpusvm_range_flags flags = { 1122 + .__flags = range->flags.__flags, 1123 + }; 1124 + 1121 1125 for (i = 0, j = 0; i < npages; j++) { 1122 1126 struct drm_pagemap_device_addr *addr = &range->dma_addr[j]; 1123 1127 ··· 1135 1131 dev, *addr); 1136 1132 i += 1 << addr->order; 1137 1133 } 1138 - range->flags.has_devmem_pages = false; 1139 - range->flags.has_dma_mapping = false; 1134 + 1135 + /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */ 1136 + flags.has_devmem_pages = false; 1137 + flags.has_dma_mapping = false; 1138 + WRITE_ONCE(range->flags.__flags, flags.__flags); 1139 + 1140 1140 range->dpagemap = NULL; 1141 1141 } 1142 1142 } ··· 1342 1334 int err = 0; 1343 1335 struct dev_pagemap *pagemap; 1344 1336 struct drm_pagemap *dpagemap; 1337 + struct drm_gpusvm_range_flags flags; 1345 1338 1346 1339 retry: 1347 1340 hmm_range.notifier_seq = mmu_interval_read_begin(notifier); ··· 1387 1378 */ 1388 1379 drm_gpusvm_notifier_lock(gpusvm); 1389 1380 1390 - if (range->flags.unmapped) { 1381 + flags.__flags = range->flags.__flags; 1382 + if (flags.unmapped) { 1391 1383 drm_gpusvm_notifier_unlock(gpusvm); 1392 1384 err = -EFAULT; 1393 1385 goto err_free; ··· 1464 1454 goto err_unmap; 1465 1455 } 1466 1456 1457 + if (ctx->devmem_only) { 1458 + err = -EFAULT; 1459 + goto err_unmap; 1460 + } 1461 + 1467 1462 addr = dma_map_page(gpusvm->drm->dev, 1468 1463 page, 0, 1469 1464 PAGE_SIZE << order, ··· 1484 1469 } 1485 1470 i += 1 << order; 1486 1471 num_dma_mapped = i; 1487 - range->flags.has_dma_mapping = true; 1472 + flags.has_dma_mapping = true; 1488 1473 } 1489 1474 1490 1475 if (zdd) { 1491 - range->flags.has_devmem_pages = true; 1476 + flags.has_devmem_pages = true; 1492 1477 range->dpagemap = dpagemap; 1493 1478 } 1479 + 1480 + /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */ 1481 + WRITE_ONCE(range->flags.__flags, 
flags.__flags); 1494 1482 1495 1483 drm_gpusvm_notifier_unlock(gpusvm); 1496 1484 kvfree(pfns); ··· 1783 1765 goto err_finalize; 1784 1766 1785 1767 /* Upon success bind devmem allocation to range and zdd */ 1768 + devmem_allocation->timeslice_expiration = get_jiffies_64() + 1769 + msecs_to_jiffies(ctx->timeslice_ms); 1786 1770 zdd->devmem_allocation = devmem_allocation; /* Owns ref */ 1787 1771 1788 1772 err_finalize: ··· 2004 1984 unsigned long start, end; 2005 1985 void *buf; 2006 1986 int i, err = 0; 1987 + 1988 + if (page) { 1989 + zdd = page->zone_device_data; 1990 + if (time_before64(get_jiffies_64(), 1991 + zdd->devmem_allocation->timeslice_expiration)) 1992 + return 0; 1993 + } 2007 1994 2008 1995 start = ALIGN_DOWN(fault_addr, size); 2009 1996 end = ALIGN(fault_addr + 1, size);
+2 -2
drivers/gpu/drm/meson/meson_encoder_hdmi.c
··· 75 75 unsigned long long venc_freq; 76 76 unsigned long long hdmi_freq; 77 77 78 - vclk_freq = mode->clock * 1000; 78 + vclk_freq = mode->clock * 1000ULL; 79 79 80 80 /* For 420, pixel clock is half unlike venc clock */ 81 81 if (encoder_hdmi->output_bus_fmt == MEDIA_BUS_FMT_UYYVYY8_0_5X24) ··· 123 123 struct meson_encoder_hdmi *encoder_hdmi = bridge_to_meson_encoder_hdmi(bridge); 124 124 struct meson_drm *priv = encoder_hdmi->priv; 125 125 bool is_hdmi2_sink = display_info->hdmi.scdc.supported; 126 - unsigned long long clock = mode->clock * 1000; 126 + unsigned long long clock = mode->clock * 1000ULL; 127 127 unsigned long long phy_freq; 128 128 unsigned long long vclk_freq; 129 129 unsigned long long venc_freq;
+4 -1
drivers/gpu/drm/tiny/panel-mipi-dbi.c
··· 390 390 391 391 spi_set_drvdata(spi, drm); 392 392 393 - drm_client_setup(drm, NULL); 393 + if (bpp == 16) 394 + drm_client_setup_with_fourcc(drm, DRM_FORMAT_RGB565); 395 + else 396 + drm_client_setup_with_fourcc(drm, DRM_FORMAT_RGB888); 394 397 395 398 return 0; 396 399 }
+4
drivers/gpu/drm/xe/instructions/xe_mi_commands.h
··· 47 47 #define MI_LRI_FORCE_POSTED REG_BIT(12) 48 48 #define MI_LRI_LEN(x) (((x) & 0xff) + 1) 49 49 50 + #define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4)) 51 + #define MI_SRM_USE_GGTT REG_BIT(22) 52 + #define MI_SRM_ADD_CS_OFFSET REG_BIT(19) 53 + 50 54 #define MI_FLUSH_DW __MI_INSTR(0x26) 51 55 #define MI_FLUSH_DW_PROTECTED_MEM_EN REG_BIT(22) 52 56 #define MI_FLUSH_DW_STORE_INDEX REG_BIT(21)
+5
drivers/gpu/drm/xe/regs/xe_engine_regs.h
··· 43 43 #define XEHPC_BCS8_RING_BASE 0x3ee000 44 44 #define GSCCS_RING_BASE 0x11a000 45 45 46 + #define ENGINE_ID(base) XE_REG((base) + 0x8c) 47 + #define ENGINE_INSTANCE_ID REG_GENMASK(9, 4) 48 + #define ENGINE_CLASS_ID REG_GENMASK(2, 0) 49 + 46 50 #define RING_TAIL(base) XE_REG((base) + 0x30) 47 51 #define TAIL_ADDR REG_GENMASK(20, 3) 48 52 ··· 158 154 #define STOP_RING REG_BIT(8) 159 155 160 156 #define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8) 157 + #define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac) 161 158 #define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc) 162 159 163 160 #define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4)
+1
drivers/gpu/drm/xe/regs/xe_gt_regs.h
··· 157 157 #define XEHPG_SC_INSTDONE_EXTRA2 XE_REG_MCR(0x7108) 158 158 159 159 #define COMMON_SLICE_CHICKEN4 XE_REG(0x7300, XE_REG_OPTION_MASKED) 160 + #define SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE REG_BIT(12) 160 161 #define DISABLE_TDC_LOAD_BALANCING_CALC REG_BIT(6) 161 162 162 163 #define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED)
+2
drivers/gpu/drm/xe/regs/xe_lrc_layout.h
··· 11 11 #define CTX_RING_TAIL (0x06 + 1) 12 12 #define CTX_RING_START (0x08 + 1) 13 13 #define CTX_RING_CTL (0x0a + 1) 14 + #define CTX_BB_PER_CTX_PTR (0x12 + 1) 14 15 #define CTX_TIMESTAMP (0x22 + 1) 16 + #define CTX_TIMESTAMP_UDW (0x24 + 1) 15 17 #define CTX_INDIRECT_RING_STATE (0x26 + 1) 16 18 #define CTX_PDP0_UDW (0x30 + 1) 17 19 #define CTX_PDP0_LDW (0x32 + 1)
+2
drivers/gpu/drm/xe/xe_device_types.h
··· 330 330 u8 has_sriov:1; 331 331 /** @info.has_usm: Device has unified shared memory support */ 332 332 u8 has_usm:1; 333 + /** @info.has_64bit_timestamp: Device supports 64-bit timestamps */ 334 + u8 has_64bit_timestamp:1; 333 335 /** @info.is_dgfx: is discrete device */ 334 336 u8 is_dgfx:1; 335 337 /**
+1 -1
drivers/gpu/drm/xe/xe_exec_queue.c
··· 830 830 { 831 831 struct xe_device *xe = gt_to_xe(q->gt); 832 832 struct xe_lrc *lrc; 833 - u32 old_ts, new_ts; 833 + u64 old_ts, new_ts; 834 834 int idx; 835 835 836 836 /*
+1 -1
drivers/gpu/drm/xe/xe_guc_submit.c
··· 941 941 return xe_sched_invalidate_job(job, 2); 942 942 } 943 943 944 - ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]); 944 + ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(q->lrc[0])); 945 945 ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]); 946 946 947 947 /*
+186 -13
drivers/gpu/drm/xe/xe_lrc.c
··· 24 24 #include "xe_hw_fence.h" 25 25 #include "xe_map.h" 26 26 #include "xe_memirq.h" 27 + #include "xe_mmio.h" 27 28 #include "xe_sriov.h" 28 29 #include "xe_trace_lrc.h" 29 30 #include "xe_vm.h" ··· 651 650 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8) 652 651 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8) 653 652 #define LRC_PARALLEL_PPHWSP_OFFSET 2048 653 + #define LRC_ENGINE_ID_PPHWSP_OFFSET 2096 654 654 #define LRC_PPHWSP_SIZE SZ_4K 655 655 656 656 u32 xe_lrc_regs_offset(struct xe_lrc *lrc) ··· 686 684 687 685 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc) 688 686 { 689 - /* The start seqno is stored in the driver-defined portion of PPHWSP */ 687 + /* This is stored in the driver-defined portion of PPHWSP */ 690 688 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET; 691 689 } 692 690 ··· 696 694 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; 697 695 } 698 696 697 + static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc) 698 + { 699 + return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET; 700 + } 701 + 699 702 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc) 700 703 { 701 704 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32); 705 + } 706 + 707 + static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc) 708 + { 709 + return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32); 702 710 } 703 711 704 712 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc) ··· 738 726 DECL_MAP_ADDR_HELPERS(start_seqno) 739 727 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp) 740 728 DECL_MAP_ADDR_HELPERS(ctx_timestamp) 729 + DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw) 741 730 DECL_MAP_ADDR_HELPERS(parallel) 742 731 DECL_MAP_ADDR_HELPERS(indirect_ring) 732 + DECL_MAP_ADDR_HELPERS(engine_id) 743 733 744 734 #undef DECL_MAP_ADDR_HELPERS 745 735 ··· 757 743 } 758 744 759 745 /** 746 + * xe_lrc_ctx_timestamp_udw_ggtt_addr() - 
Get ctx timestamp udw GGTT address 747 + * @lrc: Pointer to the lrc. 748 + * 749 + * Returns: ctx timestamp udw GGTT address 750 + */ 751 + u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc) 752 + { 753 + return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 754 + } 755 + 756 + /** 760 757 * xe_lrc_ctx_timestamp() - Read ctx timestamp value 761 758 * @lrc: Pointer to the lrc. 762 759 * 763 760 * Returns: ctx timestamp value 764 761 */ 765 - u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) 762 + u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) 766 763 { 767 764 struct xe_device *xe = lrc_to_xe(lrc); 768 765 struct iosys_map map; 766 + u32 ldw, udw = 0; 769 767 770 768 map = __xe_lrc_ctx_timestamp_map(lrc); 771 - return xe_map_read32(xe, &map); 769 + ldw = xe_map_read32(xe, &map); 770 + 771 + if (xe->info.has_64bit_timestamp) { 772 + map = __xe_lrc_ctx_timestamp_udw_map(lrc); 773 + udw = xe_map_read32(xe, &map); 774 + } 775 + 776 + return (u64)udw << 32 | ldw; 772 777 } 773 778 774 779 /** ··· 897 864 898 865 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) 899 866 { 900 - u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile); 867 + u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt)); 901 868 902 869 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc)); 903 870 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc)); ··· 910 877 xe_bo_unpin(lrc->bo); 911 878 xe_bo_unlock(lrc->bo); 912 879 xe_bo_put(lrc->bo); 880 + xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo); 881 + } 882 + 883 + /* 884 + * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active 885 + * context run ticks. 886 + * @lrc: Pointer to the lrc. 887 + * 888 + * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the 889 + * context, but only gets updated when the context switches out. 
In order to 890 + * check how long a context has been active before it switches out, two things 891 + * are required: 892 + * 893 + * (1) Determine if the context is running: 894 + * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in 895 + * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is 896 + * initialized. During a query, we just check for this value to determine if the 897 + * context is active. If the context switched out, it would overwrite this 898 + * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as 899 + * the last part of context restore, so reusing this LRC location will not 900 + * clobber anything. 901 + * 902 + * (2) Calculate the time that the context has been active for: 903 + * The CTX_TIMESTAMP ticks only when the context is active. If a context is 904 + * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. 905 + * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific 906 + * engine instance. Since we do not know which instance the context is running 907 + * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and 908 + * store it in the PPHSWP. 
909 + */ 910 + #define CONTEXT_ACTIVE 1ULL 911 + static void xe_lrc_setup_utilization(struct xe_lrc *lrc) 912 + { 913 + u32 *cmd; 914 + 915 + cmd = lrc->bb_per_ctx_bo->vmap.vaddr; 916 + 917 + *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; 918 + *cmd++ = ENGINE_ID(0).addr; 919 + *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc); 920 + *cmd++ = 0; 921 + 922 + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 923 + *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 924 + *cmd++ = 0; 925 + *cmd++ = lower_32_bits(CONTEXT_ACTIVE); 926 + 927 + if (lrc_to_xe(lrc)->info.has_64bit_timestamp) { 928 + *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 929 + *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 930 + *cmd++ = 0; 931 + *cmd++ = upper_32_bits(CONTEXT_ACTIVE); 932 + } 933 + 934 + *cmd++ = MI_BATCH_BUFFER_END; 935 + 936 + xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, 937 + xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1); 938 + 913 939 } 914 940 915 941 #define PVC_CTX_ASID (0x2e + 1) ··· 985 893 void *init_data = NULL; 986 894 u32 arb_enable; 987 895 u32 lrc_size; 896 + u32 bo_flags; 988 897 int err; 989 898 990 899 kref_init(&lrc->refcount); 900 + lrc->gt = gt; 991 901 lrc->flags = 0; 992 902 lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class); 993 903 if (xe_gt_has_indirect_ring_state(gt)) 994 904 lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE; 905 + 906 + bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT | 907 + XE_BO_FLAG_GGTT_INVALIDATE; 995 908 996 909 /* 997 910 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address ··· 1004 907 */ 1005 908 lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size, 1006 909 ttm_bo_type_kernel, 1007 - XE_BO_FLAG_VRAM_IF_DGFX(tile) | 1008 - XE_BO_FLAG_GGTT | 1009 - XE_BO_FLAG_GGTT_INVALIDATE); 910 + bo_flags); 1010 911 if (IS_ERR(lrc->bo)) 1011 912 return PTR_ERR(lrc->bo); 1012 913 914 + lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K, 915 + ttm_bo_type_kernel, 
916 + bo_flags); 917 + if (IS_ERR(lrc->bb_per_ctx_bo)) { 918 + err = PTR_ERR(lrc->bb_per_ctx_bo); 919 + goto err_lrc_finish; 920 + } 921 + 1013 922 lrc->size = lrc_size; 1014 - lrc->tile = gt_to_tile(hwe->gt); 1015 923 lrc->ring.size = ring_size; 1016 924 lrc->ring.tail = 0; 1017 - lrc->ctx_timestamp = 0; 1018 925 1019 926 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt, 1020 927 hwe->fence_irq, hwe->name); ··· 1091 990 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1092 991 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE)); 1093 992 993 + lrc->ctx_timestamp = 0; 1094 994 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0); 995 + if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 996 + xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0); 1095 997 1096 998 if (xe->info.has_asid && vm) 1097 999 xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid); ··· 1122 1018 1123 1019 map = __xe_lrc_start_seqno_map(lrc); 1124 1020 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1021 + 1022 + xe_lrc_setup_utilization(lrc); 1125 1023 1126 1024 return 0; 1127 1025 ··· 1342 1236 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc) 1343 1237 { 1344 1238 return __xe_lrc_parallel_map(lrc); 1239 + } 1240 + 1241 + /** 1242 + * xe_lrc_engine_id() - Read engine id value 1243 + * @lrc: Pointer to the lrc. 
1244 + * 1245 + * Returns: context id value 1246 + */ 1247 + static u32 xe_lrc_engine_id(struct xe_lrc *lrc) 1248 + { 1249 + struct xe_device *xe = lrc_to_xe(lrc); 1250 + struct iosys_map map; 1251 + 1252 + map = __xe_lrc_engine_id_map(lrc); 1253 + return xe_map_read32(xe, &map); 1345 1254 } 1346 1255 1347 1256 static int instr_dw(u32 cmd_header) ··· 1805 1684 snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc); 1806 1685 snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset; 1807 1686 snapshot->lrc_snapshot = NULL; 1808 - snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc); 1687 + snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc)); 1809 1688 snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc); 1810 1689 return snapshot; 1811 1690 } ··· 1905 1784 kfree(snapshot); 1906 1785 } 1907 1786 1787 + static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) 1788 + { 1789 + u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); 1790 + u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); 1791 + struct xe_hw_engine *hwe; 1792 + u64 val; 1793 + 1794 + hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); 1795 + if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), 1796 + "Unexpected engine class:instance %d:%d for context utilization\n", 1797 + class, instance)) 1798 + return -1; 1799 + 1800 + if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 1801 + val = xe_mmio_read64_2x32(&hwe->gt->mmio, 1802 + RING_CTX_TIMESTAMP(hwe->mmio_base)); 1803 + else 1804 + val = xe_mmio_read32(&hwe->gt->mmio, 1805 + RING_CTX_TIMESTAMP(hwe->mmio_base)); 1806 + 1807 + *reg_ctx_ts = val; 1808 + 1809 + return 0; 1810 + } 1811 + 1908 1812 /** 1909 1813 * xe_lrc_update_timestamp() - Update ctx timestamp 1910 1814 * @lrc: Pointer to the lrc. 1911 1815 * @old_ts: Old timestamp value 1912 1816 * 1913 1817 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 1914 - * update saved value. 1818 + * update saved value. 
With support for active contexts, the calculation may be 1819 + * slightly racy, so follow a read-again logic to ensure that the context is 1820 + * still active before returning the right timestamp. 1915 1821 * 1916 1822 * Returns: New ctx timestamp value 1917 1823 */ 1918 - u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts) 1824 + u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 1919 1825 { 1826 + u64 lrc_ts, reg_ts; 1827 + u32 engine_id; 1828 + 1920 1829 *old_ts = lrc->ctx_timestamp; 1921 1830 1922 - lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc); 1831 + lrc_ts = xe_lrc_ctx_timestamp(lrc); 1832 + /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 1833 + if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 1834 + lrc->ctx_timestamp = lrc_ts; 1835 + goto done; 1836 + } 1923 1837 1838 + if (lrc_ts == CONTEXT_ACTIVE) { 1839 + engine_id = xe_lrc_engine_id(lrc); 1840 + if (!get_ctx_timestamp(lrc, engine_id, &reg_ts)) 1841 + lrc->ctx_timestamp = reg_ts; 1842 + 1843 + /* read lrc again to ensure context is still active */ 1844 + lrc_ts = xe_lrc_ctx_timestamp(lrc); 1845 + } 1846 + 1847 + /* 1848 + * If context switched out, just use the lrc_ts. Note that this needs to 1849 + * be a separate if condition. 1850 + */ 1851 + if (lrc_ts != CONTEXT_ACTIVE) 1852 + lrc->ctx_timestamp = lrc_ts; 1853 + 1854 + done: 1924 1855 trace_xe_lrc_update_timestamp(lrc, *old_ts); 1925 1856 1926 1857 return lrc->ctx_timestamp;
+3 -2
drivers/gpu/drm/xe/xe_lrc.h
··· 120 120 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot); 121 121 122 122 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc); 123 - u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc); 123 + u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc); 124 + u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc); 124 125 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc); 125 126 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc); 126 127 ··· 137 136 * 138 137 * Returns the current LRC timestamp 139 138 */ 140 - u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts); 139 + u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts); 141 140 142 141 #endif
+6 -3
drivers/gpu/drm/xe/xe_lrc_types.h
··· 25 25 /** @size: size of lrc including any indirect ring state page */ 26 26 u32 size; 27 27 28 - /** @tile: tile which this LRC belongs to */ 29 - struct xe_tile *tile; 28 + /** @gt: gt which this LRC belongs to */ 29 + struct xe_gt *gt; 30 30 31 31 /** @flags: LRC flags */ 32 32 #define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1 ··· 52 52 struct xe_hw_fence_ctx fence_ctx; 53 53 54 54 /** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */ 55 - u32 ctx_timestamp; 55 + u64 ctx_timestamp; 56 + 57 + /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */ 58 + struct xe_bo *bb_per_ctx_bo; 56 59 }; 57 60 58 61 struct xe_lrc_snapshot;
-3
drivers/gpu/drm/xe/xe_module.c
··· 29 29 module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600); 30 30 MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must be power of 2"); 31 31 32 - module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444); 33 - MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault"); 34 - 35 32 module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444); 36 33 MODULE_PARM_DESC(force_execlist, "Force Execlist submission"); 37 34
-1
drivers/gpu/drm/xe/xe_module.h
··· 12 12 struct xe_modparam { 13 13 bool force_execlist; 14 14 bool probe_display; 15 - bool always_migrate_to_vram; 16 15 u32 force_vram_bar_size; 17 16 int guc_log_level; 18 17 char *guc_firmware_path;
+2
drivers/gpu/drm/xe/xe_pci.c
··· 140 140 .has_indirect_ring_state = 1, \ 141 141 .has_range_tlb_invalidation = 1, \ 142 142 .has_usm = 1, \ 143 + .has_64bit_timestamp = 1, \ 143 144 .va_bits = 48, \ 144 145 .vm_max_level = 4, \ 145 146 .hw_engine_mask = \ ··· 669 668 670 669 xe->info.has_range_tlb_invalidation = graphics_desc->has_range_tlb_invalidation; 671 670 xe->info.has_usm = graphics_desc->has_usm; 671 + xe->info.has_64bit_timestamp = graphics_desc->has_64bit_timestamp; 672 672 673 673 for_each_remote_tile(tile, xe, id) { 674 674 int err;
+1
drivers/gpu/drm/xe/xe_pci_types.h
··· 21 21 u8 has_indirect_ring_state:1; 22 22 u8 has_range_tlb_invalidation:1; 23 23 u8 has_usm:1; 24 + u8 has_64bit_timestamp:1; 24 25 }; 25 26 26 27 struct xe_media_desc {
+11 -3
drivers/gpu/drm/xe/xe_pt.c
··· 2232 2232 } 2233 2233 case DRM_GPUVA_OP_DRIVER: 2234 2234 { 2235 + /* WRITE_ONCE pairs with READ_ONCE in xe_svm.c */ 2236 + 2235 2237 if (op->subop == XE_VMA_SUBOP_MAP_RANGE) { 2236 - op->map_range.range->tile_present |= BIT(tile->id); 2237 - op->map_range.range->tile_invalidated &= ~BIT(tile->id); 2238 + WRITE_ONCE(op->map_range.range->tile_present, 2239 + op->map_range.range->tile_present | 2240 + BIT(tile->id)); 2241 + WRITE_ONCE(op->map_range.range->tile_invalidated, 2242 + op->map_range.range->tile_invalidated & 2243 + ~BIT(tile->id)); 2238 2244 } else if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE) { 2239 - op->unmap_range.range->tile_present &= ~BIT(tile->id); 2245 + WRITE_ONCE(op->unmap_range.range->tile_present, 2246 + op->unmap_range.range->tile_present & 2247 + ~BIT(tile->id)); 2240 2248 } 2241 2249 break; 2242 2250 }
+2 -5
drivers/gpu/drm/xe/xe_ring_ops.c
··· 234 234 235 235 static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i) 236 236 { 237 - dw[i++] = MI_COPY_MEM_MEM | MI_COPY_MEM_MEM_SRC_GGTT | 238 - MI_COPY_MEM_MEM_DST_GGTT; 237 + dw[i++] = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; 238 + dw[i++] = RING_CTX_TIMESTAMP(0).addr; 239 239 dw[i++] = xe_lrc_ctx_job_timestamp_ggtt_addr(lrc); 240 240 dw[i++] = 0; 241 - dw[i++] = xe_lrc_ctx_timestamp_ggtt_addr(lrc); 242 - dw[i++] = 0; 243 - dw[i++] = MI_NOOP; 244 241 245 242 return i; 246 243 }
+1 -1
drivers/gpu/drm/xe/xe_shrinker.c
··· 227 227 if (!shrinker) 228 228 return ERR_PTR(-ENOMEM); 229 229 230 - shrinker->shrink = shrinker_alloc(0, "xe system shrinker"); 230 + shrinker->shrink = shrinker_alloc(0, "drm-xe_gem:%s", xe->drm.unique); 231 231 if (!shrinker->shrink) { 232 232 kfree(shrinker); 233 233 return ERR_PTR(-ENOMEM);
+90 -26
drivers/gpu/drm/xe/xe_svm.c
··· 15 15 16 16 static bool xe_svm_range_in_vram(struct xe_svm_range *range) 17 17 { 18 - /* Not reliable without notifier lock */ 19 - return range->base.flags.has_devmem_pages; 18 + /* 19 + * Advisory only check whether the range is currently backed by VRAM 20 + * memory. 21 + */ 22 + 23 + struct drm_gpusvm_range_flags flags = { 24 + /* Pairs with WRITE_ONCE in drm_gpusvm.c */ 25 + .__flags = READ_ONCE(range->base.flags.__flags), 26 + }; 27 + 28 + return flags.has_devmem_pages; 20 29 } 21 30 22 31 static bool xe_svm_range_has_vram_binding(struct xe_svm_range *range) ··· 654 645 } 655 646 656 647 static bool xe_svm_range_is_valid(struct xe_svm_range *range, 657 - struct xe_tile *tile) 648 + struct xe_tile *tile, 649 + bool devmem_only) 658 650 { 659 - return (range->tile_present & ~range->tile_invalidated) & BIT(tile->id); 651 + /* 652 + * Advisory only check whether the range currently has a valid mapping, 653 + * READ_ONCE pairs with WRITE_ONCE in xe_pt.c 654 + */ 655 + return ((READ_ONCE(range->tile_present) & 656 + ~READ_ONCE(range->tile_invalidated)) & BIT(tile->id)) && 657 + (!devmem_only || xe_svm_range_in_vram(range)); 660 658 } 661 659 662 660 static struct xe_vram_region *tile_to_vr(struct xe_tile *tile) ··· 728 712 return err; 729 713 } 730 714 715 + static bool supports_4K_migration(struct xe_device *xe) 716 + { 717 + if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) 718 + return false; 719 + 720 + return true; 721 + } 722 + 723 + static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range, 724 + struct xe_vma *vma) 725 + { 726 + struct xe_vm *vm = range_to_vm(&range->base); 727 + u64 range_size = xe_svm_range_size(range); 728 + 729 + if (!range->base.flags.migrate_devmem) 730 + return false; 731 + 732 + if (xe_svm_range_in_vram(range)) { 733 + drm_dbg(&vm->xe->drm, "Range is already in VRAM\n"); 734 + return false; 735 + } 736 + 737 + if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) { 738 + drm_dbg(&vm->xe->drm, "Platform 
doesn't support SZ_4K range migration\n"); 739 + return false; 740 + } 741 + 742 + return true; 743 + } 744 + 731 745 /** 732 746 * xe_svm_handle_pagefault() - SVM handle page fault 733 747 * @vm: The VM. ··· 781 735 IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR), 782 736 .check_pages_threshold = IS_DGFX(vm->xe) && 783 737 IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0, 738 + .devmem_only = atomic && IS_DGFX(vm->xe) && 739 + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR), 740 + .timeslice_ms = atomic && IS_DGFX(vm->xe) && 741 + IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? 5 : 0, 784 742 }; 785 743 struct xe_svm_range *range; 786 744 struct drm_gpusvm_range *r; 787 745 struct drm_exec exec; 788 746 struct dma_fence *fence; 747 + int migrate_try_count = ctx.devmem_only ? 3 : 1; 789 748 ktime_t end = 0; 790 749 int err; 791 750 ··· 809 758 if (IS_ERR(r)) 810 759 return PTR_ERR(r); 811 760 761 + if (ctx.devmem_only && !r->flags.migrate_devmem) 762 + return -EACCES; 763 + 812 764 range = to_xe_range(r); 813 - if (xe_svm_range_is_valid(range, tile)) 765 + if (xe_svm_range_is_valid(range, tile, ctx.devmem_only)) 814 766 return 0; 815 767 816 768 range_debug(range, "PAGE FAULT"); 817 769 818 - /* XXX: Add migration policy, for now migrate range once */ 819 - if (!range->skip_migrate && range->base.flags.migrate_devmem && 820 - xe_svm_range_size(range) >= SZ_64K) { 821 - range->skip_migrate = true; 822 - 770 + if (--migrate_try_count >= 0 && 771 + xe_svm_range_needs_migrate_to_vram(range, vma)) { 823 772 err = xe_svm_alloc_vram(vm, tile, range, &ctx); 773 + ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ 824 774 if (err) { 825 - drm_dbg(&vm->xe->drm, 826 - "VRAM allocation failed, falling back to " 827 - "retrying fault, asid=%u, errno=%pe\n", 828 - vm->usm.asid, ERR_PTR(err)); 829 - goto retry; 775 + if (migrate_try_count || !ctx.devmem_only) { 776 + drm_dbg(&vm->xe->drm, 777 + "VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n", 778 + 
vm->usm.asid, ERR_PTR(err)); 779 + goto retry; 780 + } else { 781 + drm_err(&vm->xe->drm, 782 + "VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n", 783 + vm->usm.asid, ERR_PTR(err)); 784 + return err; 785 + } 830 786 } 831 787 } 832 788 ··· 841 783 err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx); 842 784 /* Corner where CPU mappings have changed */ 843 785 if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { 844 - if (err == -EOPNOTSUPP) { 845 - range_debug(range, "PAGE FAULT - EVICT PAGES"); 846 - drm_gpusvm_range_evict(&vm->svm.gpusvm, &range->base); 786 + ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ 787 + if (migrate_try_count > 0 || !ctx.devmem_only) { 788 + if (err == -EOPNOTSUPP) { 789 + range_debug(range, "PAGE FAULT - EVICT PAGES"); 790 + drm_gpusvm_range_evict(&vm->svm.gpusvm, 791 + &range->base); 792 + } 793 + drm_dbg(&vm->xe->drm, 794 + "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n", 795 + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); 796 + range_debug(range, "PAGE FAULT - RETRY PAGES"); 797 + goto retry; 798 + } else { 799 + drm_err(&vm->xe->drm, 800 + "Get pages failed, retry count exceeded, asid=%u, gpusvm=%p, errno=%pe\n", 801 + vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); 847 802 } 848 - drm_dbg(&vm->xe->drm, 849 - "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n", 850 - vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err)); 851 - range_debug(range, "PAGE FAULT - RETRY PAGES"); 852 - goto retry; 853 803 } 854 804 if (err) { 855 805 range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT"); ··· 881 815 drm_exec_fini(&exec); 882 816 err = PTR_ERR(fence); 883 817 if (err == -EAGAIN) { 818 + ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */ 884 819 range_debug(range, "PAGE FAULT - RETRY BIND"); 885 820 goto retry; 886 821 } ··· 891 824 } 892 825 } 893 826 drm_exec_fini(&exec); 894 - 895 - if (xe_modparam.always_migrate_to_vram) 896 - 
range->skip_migrate = false; 897 827 898 828 dma_fence_wait(fence, false); 899 829 dma_fence_put(fence);
-5
drivers/gpu/drm/xe/xe_svm.h
··· 36 36 * range. Protected by GPU SVM notifier lock. 37 37 */ 38 38 u8 tile_invalidated; 39 - /** 40 - * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler 41 - * locking. 42 - */ 43 - u8 skip_migrate :1; 44 39 }; 45 40 46 41 #if IS_ENABLED(CONFIG_DRM_GPUSVM)
+4 -4
drivers/gpu/drm/xe/xe_trace_lrc.h
··· 19 19 #define __dev_name_lrc(lrc) dev_name(gt_to_xe((lrc)->fence_ctx.gt)->drm.dev) 20 20 21 21 TRACE_EVENT(xe_lrc_update_timestamp, 22 - TP_PROTO(struct xe_lrc *lrc, uint32_t old), 22 + TP_PROTO(struct xe_lrc *lrc, uint64_t old), 23 23 TP_ARGS(lrc, old), 24 24 TP_STRUCT__entry( 25 25 __field(struct xe_lrc *, lrc) 26 - __field(u32, old) 27 - __field(u32, new) 26 + __field(u64, old) 27 + __field(u64, new) 28 28 __string(name, lrc->fence_ctx.name) 29 29 __string(device_id, __dev_name_lrc(lrc)) 30 30 ), ··· 36 36 __assign_str(name); 37 37 __assign_str(device_id); 38 38 ), 39 - TP_printk("lrc=:%p lrc->name=%s old=%u new=%u device_id:%s", 39 + TP_printk("lrc=:%p lrc->name=%s old=%llu new=%llu device_id:%s", 40 40 __entry->lrc, __get_str(name), 41 41 __entry->old, __entry->new, 42 42 __get_str(device_id))
+4
drivers/gpu/drm/xe/xe_wa.c
··· 815 815 XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)), 816 816 XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX)) 817 817 }, 818 + { XE_RTP_NAME("22021007897"), 819 + XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)), 820 + XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE)) 821 + }, 818 822 819 823 /* Xe3_LPG */ 820 824 { XE_RTP_NAME("14021490052"),
+33 -14
include/drm/drm_gpusvm.h
··· 89 89 * @ops: Pointer to the operations structure for GPU SVM device memory 90 90 * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to. 91 91 * @size: Size of device memory allocation 92 + * @timeslice_expiration: Timeslice expiration in jiffies 92 93 */ 93 94 struct drm_gpusvm_devmem { 94 95 struct device *dev; ··· 98 97 const struct drm_gpusvm_devmem_ops *ops; 99 98 struct drm_pagemap *dpagemap; 100 99 size_t size; 100 + u64 timeslice_expiration; 101 101 }; 102 102 103 103 /** ··· 188 186 }; 189 187 190 188 /** 189 + * struct drm_gpusvm_range_flags - Structure representing a GPU SVM range flags 190 + * 191 + * @migrate_devmem: Flag indicating whether the range can be migrated to device memory 192 + * @unmapped: Flag indicating if the range has been unmapped 193 + * @partial_unmap: Flag indicating if the range has been partially unmapped 194 + * @has_devmem_pages: Flag indicating if the range has devmem pages 195 + * @has_dma_mapping: Flag indicating if the range has a DMA mapping 196 + * @__flags: Flags for range in u16 form (used for READ_ONCE) 197 + */ 198 + struct drm_gpusvm_range_flags { 199 + union { 200 + struct { 201 + /* All flags below must be set upon creation */ 202 + u16 migrate_devmem : 1; 203 + /* All flags below must be set / cleared under notifier lock */ 204 + u16 unmapped : 1; 205 + u16 partial_unmap : 1; 206 + u16 has_devmem_pages : 1; 207 + u16 has_dma_mapping : 1; 208 + }; 209 + u16 __flags; 210 + }; 211 + }; 212 + 213 + /** 191 214 * struct drm_gpusvm_range - Structure representing a GPU SVM range 192 215 * 193 216 * @gpusvm: Pointer to the GPU SVM structure ··· 225 198 * @dpagemap: The struct drm_pagemap of the device pages we're dma-mapping. 226 199 * Note this is assuming only one drm_pagemap per range is allowed. 
227 200 * @flags: Flags for range 228 - * @flags.migrate_devmem: Flag indicating whether the range can be migrated to device memory 229 - * @flags.unmapped: Flag indicating if the range has been unmapped 230 - * @flags.partial_unmap: Flag indicating if the range has been partially unmapped 231 - * @flags.has_devmem_pages: Flag indicating if the range has devmem pages 232 - * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping 233 201 * 234 202 * This structure represents a GPU SVM range used for tracking memory ranges 235 203 * mapped in a DRM device. ··· 238 216 unsigned long notifier_seq; 239 217 struct drm_pagemap_device_addr *dma_addr; 240 218 struct drm_pagemap *dpagemap; 241 - struct { 242 - /* All flags below must be set upon creation */ 243 - u16 migrate_devmem : 1; 244 - /* All flags below must be set / cleared under notifier lock */ 245 - u16 unmapped : 1; 246 - u16 partial_unmap : 1; 247 - u16 has_devmem_pages : 1; 248 - u16 has_dma_mapping : 1; 249 - } flags; 219 + struct drm_gpusvm_range_flags flags; 250 220 }; 251 221 252 222 /** ··· 297 283 * @check_pages_threshold: Check CPU pages for present if chunk is less than or 298 284 * equal to threshold. If not present, reduce chunk 299 285 * size. 286 + * @timeslice_ms: The timeslice MS which in minimum time a piece of memory 287 + * remains with either exclusive GPU or CPU access. 300 288 * @in_notifier: entering from a MMU notifier 301 289 * @read_only: operating on read-only memory 302 290 * @devmem_possible: possible to use device memory 291 + * @devmem_only: use only device memory 303 292 * 304 293 * Context that is DRM GPUSVM is operating in (i.e. user arguments). 
305 294 */ 306 295 struct drm_gpusvm_ctx { 307 296 unsigned long check_pages_threshold; 297 + unsigned long timeslice_ms; 308 298 unsigned int in_notifier :1; 309 299 unsigned int read_only :1; 310 300 unsigned int devmem_possible :1; 301 + unsigned int devmem_only :1; 311 302 }; 312 303 313 304 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,