Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

vfio/mlx5: Add REINIT support to VFIO_MIG_GET_PRECOPY_INFO

When userspace opts into VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2, the
driver may report the VFIO_PRECOPY_INFO_REINIT output flag in response
to the VFIO_MIG_GET_PRECOPY_INFO ioctl, along with a new initial_bytes
value.

The presence of the VFIO_PRECOPY_INFO_REINIT flag indicates to the
caller that new initial data is available in the migration stream.

If the firmware reports a new initial-data chunk, any bytes already held
in memory but not yet read (including those previously counted as dirty
bytes) are treated as initial bytes, since the caller must read them,
together with the new chunk, before reaching the end of the initial-data
region.

In this case, the driver issues a new SAVE command to fetch the data and
prepare it for a subsequent read() from userspace.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20260317161753.18964-7-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>

authored by

Yishai Hadas and committed by
Alex Williamson
56f90177 bd0da611

+86 -42
+16 -4
drivers/vfio/pci/mlx5/cmd.c
··· 87 87 88 88 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, 89 89 size_t *state_size, u64 *total_size, 90 - u8 query_flags) 90 + u8 *mig_state, u8 query_flags) 91 91 { 92 92 u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; 93 93 u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; ··· 151 151 *total_size = mvdev->chunk_mode ? 152 152 MLX5_GET64(query_vhca_migration_state_out, out, 153 153 remaining_total_size) : *state_size; 154 + 155 + if (mig_state && mvdev->mig_state_cap) 156 + *mig_state = MLX5_GET(query_vhca_migration_state_out, out, 157 + migration_state); 154 158 155 159 return 0; 156 160 } ··· 280 276 281 277 if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) 282 278 mvdev->chunk_mode = 1; 279 + 280 + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_state)) 281 + mvdev->mig_state_cap = 1; 283 282 284 283 end: 285 284 mlx5_vf_put_core_dev(mvdev->mdev); ··· 562 555 { 563 556 spin_lock_irq(&buf->migf->list_lock); 564 557 buf->stop_copy_chunk_num = 0; 558 + buf->pre_copy_init_bytes_chunk = false; 565 559 list_add_tail(&buf->buf_elm, &buf->migf->avail_list); 566 560 spin_unlock_irq(&buf->migf->list_lock); 567 561 } ··· 697 689 !next_required_umem_size; 698 690 if (async_data->header_buf) { 699 691 status = add_buf_header(async_data->header_buf, image_size, 700 - initial_pre_copy); 692 + initial_pre_copy || 693 + async_data->buf->pre_copy_init_bytes_chunk); 701 694 if (status) 702 695 goto err; 703 696 } ··· 717 708 } 718 709 } 719 710 spin_unlock_irqrestore(&migf->list_lock, flags); 720 - if (initial_pre_copy) { 711 + if (initial_pre_copy || async_data->buf->pre_copy_init_bytes_chunk) { 721 712 migf->pre_copy_initial_bytes += image_size; 722 - migf->state = MLX5_MIGF_STATE_PRE_COPY; 713 + if (initial_pre_copy) 714 + migf->state = MLX5_MIGF_STATE_PRE_COPY; 715 + if (async_data->buf->pre_copy_init_bytes_chunk) 716 + async_data->buf->pre_copy_init_bytes_chunk = false; 723 717 } 724 718 if (stop_copy_last_chunk) 725 
719 migf->state = MLX5_MIGF_STATE_COMPLETE;
+4 -1
drivers/vfio/pci/mlx5/cmd.h
··· 62 62 u32 *mkey_in; 63 63 enum dma_data_direction dma_dir; 64 64 u8 stop_copy_chunk_num; 65 + bool pre_copy_init_bytes_chunk; 65 66 struct list_head buf_elm; 66 67 struct mlx5_vf_migration_file *migf; 67 68 }; ··· 98 97 u32 record_tag; 99 98 u64 stop_copy_prep_size; 100 99 u64 pre_copy_initial_bytes; 100 + u64 pre_copy_initial_bytes_start; 101 101 size_t next_required_umem_size; 102 102 u8 num_ready_chunks; 103 103 /* Upon chunk mode preserve another set of buffers for stop_copy phase */ ··· 177 175 u8 mdev_detach:1; 178 176 u8 log_active:1; 179 177 u8 chunk_mode:1; 178 + u8 mig_state_cap:1; 180 179 struct completion tracker_comp; 181 180 /* protect migration state */ 182 181 struct mutex state_mutex; ··· 202 199 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); 203 200 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, 204 201 size_t *state_size, u64 *total_size, 205 - u8 query_flags); 202 + u8 *migration_state, u8 query_flags); 206 203 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, 207 204 const struct vfio_migration_ops *mig_ops, 208 205 const struct vfio_log_ops *log_ops);
+66 -37
drivers/vfio/pci/mlx5/main.c
··· 464 464 struct mlx5_vhca_data_buffer *buf; 465 465 struct vfio_precopy_info info = {}; 466 466 loff_t *pos = &filp->f_pos; 467 + u8 migration_state = 0; 467 468 size_t inc_length = 0; 468 - bool end_of_data = false; 469 + bool reinit_state; 470 + bool end_of_data; 469 471 int ret; 470 472 471 473 ret = vfio_check_precopy_ioctl(&mvdev->core_device.vdev, cmd, arg, ··· 494 492 * As so, the other code below is safe with the proper locks. 495 493 */ 496 494 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, 497 - NULL, MLX5VF_QUERY_INC); 495 + NULL, &migration_state, 496 + MLX5VF_QUERY_INC); 498 497 if (ret) 499 498 goto err_state_unlock; 500 499 } ··· 506 503 goto err_migf_unlock; 507 504 } 508 505 509 - if (migf->pre_copy_initial_bytes > *pos) { 510 - info.initial_bytes = migf->pre_copy_initial_bytes - *pos; 511 - } else { 512 - info.dirty_bytes = migf->max_pos - *pos; 513 - if (!info.dirty_bytes) 514 - end_of_data = true; 515 - info.dirty_bytes += inc_length; 516 - } 517 - 518 - if (!end_of_data || !inc_length) { 519 - mutex_unlock(&migf->lock); 520 - goto done; 521 - } 522 - 523 - mutex_unlock(&migf->lock); 524 506 /* 525 - * We finished transferring the current state and the device has a 526 - * dirty state, save a new state to be ready for. 
507 + * opt-in for VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 serves 508 + * as opt-in for VFIO_PRECOPY_INFO_REINIT as well 527 509 */ 528 - buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), 529 - DMA_FROM_DEVICE); 530 - if (IS_ERR(buf)) { 531 - ret = PTR_ERR(buf); 532 - mlx5vf_mark_err(migf); 533 - goto err_state_unlock; 510 + reinit_state = mvdev->core_device.vdev.precopy_info_v2 && 511 + migration_state == MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_INIT; 512 + end_of_data = !(migf->max_pos - *pos); 513 + if (reinit_state) { 514 + /* 515 + * Any bytes already present in memory are treated as initial 516 + * bytes, since the caller is required to read them before 517 + * reaching the new initial-bytes region. 518 + */ 519 + migf->pre_copy_initial_bytes_start = *pos; 520 + migf->pre_copy_initial_bytes = migf->max_pos - *pos; 521 + info.initial_bytes = migf->pre_copy_initial_bytes + inc_length; 522 + info.flags |= VFIO_PRECOPY_INFO_REINIT; 523 + } else { 524 + if (migf->pre_copy_initial_bytes_start + 525 + migf->pre_copy_initial_bytes > *pos) { 526 + WARN_ON_ONCE(end_of_data); 527 + info.initial_bytes = migf->pre_copy_initial_bytes_start + 528 + migf->pre_copy_initial_bytes - *pos; 529 + } else { 530 + info.dirty_bytes = (migf->max_pos - *pos) + inc_length; 531 + } 532 + } 533 + mutex_unlock(&migf->lock); 534 + 535 + if ((reinit_state || end_of_data) && inc_length) { 536 + /* 537 + * In case we finished transferring the current state and the 538 + * device has a dirty state, or that the device has a new init 539 + * state, save a new state to be ready for. 
540 + */ 541 + buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), 542 + DMA_FROM_DEVICE); 543 + if (IS_ERR(buf)) { 544 + ret = PTR_ERR(buf); 545 + mlx5vf_mark_err(migf); 546 + goto err_state_unlock; 547 + } 548 + 549 + buf->pre_copy_init_bytes_chunk = reinit_state; 550 + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); 551 + if (ret) { 552 + mlx5vf_mark_err(migf); 553 + mlx5vf_put_data_buffer(buf); 554 + goto err_state_unlock; 555 + } 556 + 557 + /* 558 + * SAVE appends a header record via add_buf_header(), 559 + * let's account it as well. 560 + */ 561 + if (reinit_state) 562 + info.initial_bytes += sizeof(struct mlx5_vf_migration_header); 563 + else 564 + info.dirty_bytes += sizeof(struct mlx5_vf_migration_header); 534 565 } 535 566 536 - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); 537 - if (ret) { 538 - mlx5vf_mark_err(migf); 539 - mlx5vf_put_data_buffer(buf); 540 - goto err_state_unlock; 541 - } 542 - 543 - done: 544 567 mlx5vf_state_mutex_unlock(mvdev); 545 568 if (copy_to_user((void __user *)arg, &info, 546 569 offsetofend(struct vfio_precopy_info, dirty_bytes))) ··· 599 570 if (migf->state == MLX5_MIGF_STATE_ERROR) 600 571 return -ENODEV; 601 572 602 - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 573 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, NULL, 603 574 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); 604 575 if (ret) 605 576 goto err; ··· 665 636 if (ret) 666 637 goto out; 667 638 668 - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); 639 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, NULL, 0); 669 640 if (ret) 670 641 goto out_pd; 671 642 ··· 1152 1123 enum mlx5_vf_migf_state state; 1153 1124 size_t size; 1154 1125 1155 - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, 1126 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, NULL, 1156 1127 MLX5VF_QUERY_INC | 
MLX5VF_QUERY_CLEANUP); 1157 1128 if (ret) 1158 1129 return ERR_PTR(ret); ··· 1277 1248 1278 1249 mutex_lock(&mvdev->state_mutex); 1279 1250 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, 1280 - &total_size, 0); 1251 + &total_size, NULL, 0); 1281 1252 if (!ret) 1282 1253 *stop_copy_length = total_size; 1283 1254 mlx5vf_state_mutex_unlock(mvdev);