Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'ptp-vmclock-add-vm-generation-counter-and-acpi-notification'

Takahiro Itazuri says:

====================
ptp: vmclock: Add VM generation counter and ACPI notification

Similarly to live migration, starting a VM from some serialized state
(aka snapshot) is an event which calls for adjusting guest clocks, hence
a hypervisor should increase the disruption_marker before resuming the
VM vCPUs, letting the guest know.

However, loading a snapshot is slightly different from live migration,
especially since we can start multiple VMs from the same serialized
state. Apart from adjusting clocks, the guest needs to take additional
action during such events, e.g. recreate UUIDs, reset network
adapters/connections, reseed entropy pools, etc. These actions are not
necessary during live migration. This calls for a differentiation
between the two triggering events.

We differentiate between the two events via an extra field in the
vmclock_abi, called vm_generation_counter. Whereas hypervisors should
increase the disruption marker in both cases, they should only increase
vm_generation_counter when a snapshot is loaded in a VM (not during live
migration).

Additionally, we attach an ACPI notification to VMClock. Implementing
the notification is optional for the device. A VMClock device declares
that it implements the notification by setting the
VMCLOCK_FLAG_NOTIFICATION_PRESENT bit in the vmclock_abi flags.
Hypervisors that implement the notification must send an ACPI
notification every time seq_count changes to an even number. The driver
will propagate these notifications to userspace via the poll() interface.
====================

Link: https://patch.msgid.link/20260130173704.12575-1-itazur@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+279 -26
+46
Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml
··· 1 + # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 + %YAML 1.2 3 + --- 4 + $id: http://devicetree.org/schemas/ptp/amazon,vmclock.yaml# 5 + $schema: http://devicetree.org/meta-schemas/core.yaml# 6 + 7 + title: Virtual Machine Clock 8 + 9 + maintainers: 10 + - David Woodhouse <dwmw2@infradead.org> 11 + 12 + description: 13 + The vmclock device provides a precise clock source and allows for 14 + accurate timekeeping across live migration and snapshot/restore 15 + operations. The full specification of the shared data structure is 16 + available at https://uapi-group.org/specifications/specs/vmclock/ 17 + 18 + properties: 19 + compatible: 20 + const: amazon,vmclock 21 + 22 + reg: 23 + description: 24 + Specifies the shared memory region containing the vmclock_abi structure. 25 + maxItems: 1 26 + 27 + interrupts: 28 + description: 29 + Interrupt used to notify when the contents of the vmclock_abi structure 30 + have been updated. 31 + maxItems: 1 32 + 33 + required: 34 + - compatible 35 + - reg 36 + 37 + additionalProperties: false 38 + 39 + examples: 40 + - | 41 + #include <dt-bindings/interrupt-controller/arm-gic.h> 42 + ptp@80000000 { 43 + compatible = "amazon,vmclock"; 44 + reg = <0x80000000 0x1000>; 45 + interrupts = <GIC_SPI 36 IRQ_TYPE_EDGE_RISING>; 46 + };
+1
MAINTAINERS
··· 21037 21037 M: David Woodhouse <dwmw2@infradead.org> 21038 21038 L: netdev@vger.kernel.org 21039 21039 S: Maintained 21040 + F: Documentation/devicetree/bindings/ptp/amazon,vmclock.yaml 21040 21041 F: drivers/ptp/ptp_vmclock.c 21041 21042 F: include/uapi/linux/vmclock-abi.h 21042 21043
+1 -1
drivers/ptp/Kconfig
··· 134 134 config PTP_1588_CLOCK_VMCLOCK 135 135 tristate "Virtual machine PTP clock" 136 136 depends on X86_TSC || ARM_ARCH_TIMER 137 - depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 137 + depends on PTP_1588_CLOCK && ARCH_SUPPORTS_INT128 138 138 default PTP_1588_CLOCK_KVM 139 139 help 140 140 This driver adds support for using a virtual precision clock
+211 -25
drivers/ptp/ptp_vmclock.c
··· 5 5 * Copyright © 2024 Amazon.com, Inc. or its affiliates. 6 6 */ 7 7 8 + #include "linux/poll.h" 9 + #include "linux/types.h" 10 + #include "linux/wait.h" 8 11 #include <linux/acpi.h> 9 12 #include <linux/device.h> 10 13 #include <linux/err.h> 11 14 #include <linux/file.h> 12 15 #include <linux/fs.h> 13 16 #include <linux/init.h> 17 + #include <linux/io.h> 18 + #include <linux/interrupt.h> 14 19 #include <linux/kernel.h> 15 20 #include <linux/miscdevice.h> 16 21 #include <linux/mm.h> 17 22 #include <linux/module.h> 23 + #include <linux/of.h> 18 24 #include <linux/platform_device.h> 19 25 #include <linux/slab.h> 20 26 ··· 45 39 struct resource res; 46 40 struct vmclock_abi *clk; 47 41 struct miscdevice miscdev; 42 + wait_queue_head_t disrupt_wait; 48 43 struct ptp_clock_info ptp_clock_info; 49 44 struct ptp_clock *ptp_clock; 50 45 enum clocksource_ids cs_id, sys_cs_id; ··· 83 76 84 77 static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) 85 78 { 86 - if (likely(clk->time_type == VMCLOCK_TIME_UTC)) 79 + if (clk->time_type == VMCLOCK_TIME_TAI) 87 80 return true; 88 81 89 - if (clk->time_type == VMCLOCK_TIME_TAI && 82 + if (clk->time_type == VMCLOCK_TIME_UTC && 90 83 (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { 91 84 if (sec) 92 - *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); 85 + *sec -= (int16_t)le16_to_cpu(clk->tai_offset_sec); 93 86 return true; 94 87 } 95 88 return false; ··· 350 343 return NULL; 351 344 } 352 345 353 - /* Only UTC, or TAI with offset */ 346 + /* Accept TAI directly, or UTC with valid offset for conversion to TAI */ 354 347 if (!tai_adjust(st->clk, NULL)) { 355 - dev_info(dev, "vmclock does not provide unambiguous UTC\n"); 348 + dev_info(dev, "vmclock does not provide unambiguous time\n"); 356 349 return NULL; 357 350 } 358 351 ··· 364 357 return ptp_clock_register(&st->ptp_clock_info, dev); 365 358 } 366 359 360 + struct vmclock_file_state { 361 + struct vmclock_state *st; 362 + atomic_t seq; 363 + }; 364 + 367 
365 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) 368 366 { 369 - struct vmclock_state *st = container_of(fp->private_data, 370 - struct vmclock_state, miscdev); 367 + struct vmclock_file_state *fst = fp->private_data; 368 + struct vmclock_state *st = fst->st; 371 369 372 370 if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) 373 371 return -EROFS; ··· 391 379 static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, 392 380 size_t count, loff_t *ppos) 393 381 { 394 - struct vmclock_state *st = container_of(fp->private_data, 395 - struct vmclock_state, miscdev); 396 382 ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); 383 + struct vmclock_file_state *fst = fp->private_data; 384 + struct vmclock_state *st = fst->st; 385 + uint32_t seq, old_seq; 397 386 size_t max_count; 398 - uint32_t seq; 399 387 400 388 if (*ppos >= PAGE_SIZE) 401 389 return 0; ··· 404 392 if (count > max_count) 405 393 count = max_count; 406 394 395 + old_seq = atomic_read(&fst->seq); 407 396 while (1) { 408 397 seq = le32_to_cpu(st->clk->seq_count) & ~1U; 409 398 /* Pairs with hypervisor wmb */ ··· 415 402 416 403 /* Pairs with hypervisor wmb */ 417 404 virt_rmb(); 418 - if (seq == le32_to_cpu(st->clk->seq_count)) 419 - break; 405 + if (seq == le32_to_cpu(st->clk->seq_count)) { 406 + /* 407 + * Either we updated fst->seq to seq (the latest version we observed) 408 + * or someone else did (old_seq == seq), so we can break. 
409 + */ 410 + if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) || 411 + old_seq == seq) { 412 + break; 413 + } 414 + } 420 415 421 416 if (ktime_after(ktime_get(), deadline)) 422 417 return -ETIMEDOUT; ··· 434 413 return count; 435 414 } 436 415 416 + static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait) 417 + { 418 + struct vmclock_file_state *fst = fp->private_data; 419 + struct vmclock_state *st = fst->st; 420 + uint32_t seq; 421 + 422 + /* 423 + * Hypervisor will not send us any notifications, so fail immediately 424 + * to avoid having caller sleeping for ever. 425 + */ 426 + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) 427 + return POLLHUP; 428 + 429 + poll_wait(fp, &st->disrupt_wait, wait); 430 + 431 + seq = le32_to_cpu(st->clk->seq_count); 432 + if (atomic_read(&fst->seq) != seq) 433 + return POLLIN | POLLRDNORM; 434 + 435 + return 0; 436 + } 437 + 438 + static int vmclock_miscdev_open(struct inode *inode, struct file *fp) 439 + { 440 + struct vmclock_state *st = container_of(fp->private_data, 441 + struct vmclock_state, miscdev); 442 + struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL); 443 + 444 + if (!fst) 445 + return -ENOMEM; 446 + 447 + fst->st = st; 448 + atomic_set(&fst->seq, 0); 449 + 450 + fp->private_data = fst; 451 + 452 + return 0; 453 + } 454 + 455 + static int vmclock_miscdev_release(struct inode *inode, struct file *fp) 456 + { 457 + kfree(fp->private_data); 458 + return 0; 459 + } 460 + 437 461 static const struct file_operations vmclock_miscdev_fops = { 438 462 .owner = THIS_MODULE, 463 + .open = vmclock_miscdev_open, 464 + .release = vmclock_miscdev_release, 439 465 .mmap = vmclock_miscdev_mmap, 440 466 .read = vmclock_miscdev_read, 467 + .poll = vmclock_miscdev_poll, 441 468 }; 442 469 443 470 /* module operations */ 444 471 445 - static void vmclock_remove(void *data) 446 - { 447 - struct vmclock_state *st = data; 448 - 449 - if (st->ptp_clock) 450 - 
ptp_clock_unregister(st->ptp_clock); 451 - 452 - if (st->miscdev.minor != MISC_DYNAMIC_MINOR) 453 - misc_deregister(&st->miscdev); 454 - } 455 - 472 + #if IS_ENABLED(CONFIG_ACPI) 456 473 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) 457 474 { 458 475 struct vmclock_state *st = data; ··· 518 459 return AE_ERROR; 519 460 } 520 461 462 + static void 463 + vmclock_acpi_notification_handler(acpi_handle __always_unused handle, 464 + u32 __always_unused event, void *dev) 465 + { 466 + struct device *device = dev; 467 + struct vmclock_state *st = device->driver_data; 468 + 469 + wake_up_interruptible(&st->disrupt_wait); 470 + } 471 + 472 + static int vmclock_setup_acpi_notification(struct device *dev) 473 + { 474 + struct acpi_device *adev = ACPI_COMPANION(dev); 475 + acpi_status status; 476 + 477 + /* 478 + * This should never happen as this function is only called when 479 + * has_acpi_companion(dev) is true, but the logic is sufficiently 480 + * complex that Coverity can't see the tautology. 
481 + */ 482 + if (!adev) 483 + return -ENODEV; 484 + 485 + status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY, 486 + vmclock_acpi_notification_handler, 487 + dev); 488 + if (ACPI_FAILURE(status)) { 489 + dev_err(dev, "failed to install notification handler"); 490 + return -ENODEV; 491 + } 492 + 493 + return 0; 494 + } 495 + 521 496 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) 522 497 { 523 498 struct acpi_device *adev = ACPI_COMPANION(dev); ··· 574 481 575 482 return 0; 576 483 } 484 + #endif /* CONFIG_ACPI */ 485 + 486 + static irqreturn_t vmclock_of_irq_handler(int __always_unused irq, void *_st) 487 + { 488 + struct vmclock_state *st = _st; 489 + 490 + wake_up_interruptible(&st->disrupt_wait); 491 + return IRQ_HANDLED; 492 + } 493 + 494 + static int vmclock_probe_dt(struct device *dev, struct vmclock_state *st) 495 + { 496 + struct platform_device *pdev = to_platform_device(dev); 497 + struct resource *res; 498 + 499 + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 500 + if (!res) 501 + return -ENODEV; 502 + 503 + st->res = *res; 504 + 505 + return 0; 506 + } 507 + 508 + static int vmclock_setup_of_notification(struct device *dev) 509 + { 510 + struct platform_device *pdev = to_platform_device(dev); 511 + int irq; 512 + 513 + irq = platform_get_irq(pdev, 0); 514 + if (irq < 0) 515 + return irq; 516 + 517 + return devm_request_irq(dev, irq, vmclock_of_irq_handler, IRQF_SHARED, 518 + "vmclock", dev->driver_data); 519 + } 520 + 521 + static int vmclock_setup_notification(struct device *dev, 522 + struct vmclock_state *st) 523 + { 524 + /* The device does not support notifications. 
Nothing else to do */ 525 + if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT)) 526 + return 0; 527 + 528 + #if IS_ENABLED(CONFIG_ACPI) 529 + if (has_acpi_companion(dev)) 530 + return vmclock_setup_acpi_notification(dev); 531 + #endif 532 + return vmclock_setup_of_notification(dev); 533 + } 534 + 535 + static void vmclock_remove(void *data) 536 + { 537 + struct device *dev = data; 538 + struct vmclock_state *st = dev->driver_data; 539 + 540 + if (!st) { 541 + dev_err(dev, "%s called with NULL driver_data", __func__); 542 + return; 543 + } 544 + 545 + #if IS_ENABLED(CONFIG_ACPI) 546 + if (has_acpi_companion(dev)) 547 + acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle, 548 + ACPI_DEVICE_NOTIFY, 549 + vmclock_acpi_notification_handler); 550 + #endif 551 + 552 + if (st->ptp_clock) 553 + ptp_clock_unregister(st->ptp_clock); 554 + 555 + if (st->miscdev.minor != MISC_DYNAMIC_MINOR) 556 + misc_deregister(&st->miscdev); 557 + 558 + dev->driver_data = NULL; 559 + } 577 560 578 561 static void vmclock_put_idx(void *data) 579 562 { ··· 668 499 if (!st) 669 500 return -ENOMEM; 670 501 502 + #if IS_ENABLED(CONFIG_ACPI) 671 503 if (has_acpi_companion(dev)) 672 504 ret = vmclock_probe_acpi(dev, st); 673 505 else 674 - ret = -EINVAL; /* Only ACPI for now */ 506 + #endif 507 + ret = vmclock_probe_dt(dev, st); 675 508 676 509 if (ret) { 677 510 dev_info(dev, "Failed to obtain physical address: %d\n", ret); ··· 716 545 717 546 st->miscdev.minor = MISC_DYNAMIC_MINOR; 718 547 719 - ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st); 548 + init_waitqueue_head(&st->disrupt_wait); 549 + dev->driver_data = st; 550 + 551 + ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev); 552 + if (ret) 553 + return ret; 554 + 555 + ret = vmclock_setup_notification(dev, st); 720 556 if (ret) 721 557 return ret; 722 558 ··· 769 591 770 592 static const struct acpi_device_id vmclock_acpi_ids[] = { 771 593 { "AMZNC10C", 0 }, 594 + { "VMCLOCK", 0 }, 772 595 {} 
773 596 }; 774 597 MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); 598 + 599 + static const struct of_device_id vmclock_of_ids[] = { 600 + { .compatible = "amazon,vmclock", }, 601 + { }, 602 + }; 603 + MODULE_DEVICE_TABLE(of, vmclock_of_ids); 775 604 776 605 static struct platform_driver vmclock_platform_driver = { 777 606 .probe = vmclock_probe, 778 607 .driver = { 779 608 .name = "vmclock", 780 609 .acpi_match_table = vmclock_acpi_ids, 610 + .of_match_table = vmclock_of_ids, 781 611 }, 782 612 }; 783 613
+20
include/uapi/linux/vmclock-abi.h
··· 115 115 * bit again after the update, using the about-to-be-valid fields. 116 116 */ 117 117 #define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) 118 + /* 119 + * If the VM_GEN_COUNTER_PRESENT flag is set, the hypervisor will 120 + * bump the vm_generation_counter field every time the guest is 121 + * loaded from some save state (restored from a snapshot). 122 + */ 123 + #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT (1 << 8) 124 + /* 125 + * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send 126 + * a notification every time it updates seq_count to a new even number. 127 + */ 128 + #define VMCLOCK_FLAG_NOTIFICATION_PRESENT (1 << 9) 118 129 119 130 __u8 pad[2]; 120 131 __u8 clock_status; ··· 188 177 __le64 time_frac_sec; /* Units of 1/2^64 of a second */ 189 178 __le64 time_esterror_nanosec; 190 179 __le64 time_maxerror_nanosec; 180 + 181 + /* 182 + * This field changes to another non-repeating value when the guest 183 + * has been loaded from a snapshot. In addition to handling a 184 + * disruption in time (which will also be signalled through the 185 + * disruption_marker field), a guest may wish to discard UUIDs, 186 + * reset network connections, reseed entropy, etc. 187 + */ 188 + __le64 vm_generation_counter; 191 189 }; 192 190 193 191 #endif /* __VMCLOCK_ABI_H__ */