Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus

+483 -238

Documentation/lguest/lguest.c

··· 1 - /*P:100 This is the Launcher code, a simple program which lays out the 2 - * "physical" memory for the new Guest by mapping the kernel image and 3 - * the virtual devices, then opens /dev/lguest to tell the kernel 4 - * about the Guest and control it. :*/ 1 + /*P:100 2 + * This is the Launcher code, a simple program which lays out the "physical" 3 + * memory for the new Guest by mapping the kernel image and the virtual 4 + * devices, then opens /dev/lguest to tell the kernel about the Guest and 5 + * control it. 6 + :*/ 5 7 #define _LARGEFILE64_SOURCE 6 8 #define _GNU_SOURCE 7 9 #include <stdio.h> ··· 48 46 #include "linux/virtio_rng.h" 49 47 #include "linux/virtio_ring.h" 50 48 #include "asm/bootparam.h" 51 - /*L:110 We can ignore the 39 include files we need for this program, but I do 52 - * want to draw attention to the use of kernel-style types. 49 + /*L:110 50 + * We can ignore the 42 include files we need for this program, but I do want 51 + * to draw attention to the use of kernel-style types. 53 52 * 54 53 * As Linus said, "C is a Spartan language, and so should your naming be." I 55 54 * like these abbreviations, so we define them here. Note that u64 is always 56 55 * unsigned long long, which works on all Linux systems: this means that we can 57 - * use %llu in printf for any u64. */ 56 + * use %llu in printf for any u64. 57 + */ 58 58 typedef unsigned long long u64; 59 59 typedef uint32_t u32; 60 60 typedef uint16_t u16; ··· 73 69 /* This will occupy 3 pages: it must be a power of 2. */ 74 70 #define VIRTQUEUE_NUM 256 75 71 76 - /*L:120 verbose is both a global flag and a macro. The C preprocessor allows 77 - * this, and although I wouldn't recommend it, it works quite nicely here. */ 72 + /*L:120 73 + * verbose is both a global flag and a macro. The C preprocessor allows 74 + * this, and although I wouldn't recommend it, it works quite nicely here. 75 + */ 78 76 static bool verbose; 79 77 #define verbose(args...) 
\ 80 78 do { if (verbose) printf(args); } while(0) ··· 93 87 static unsigned int __thread cpu_id; 94 88 95 89 /* This is our list of devices. */ 96 - struct device_list 97 - { 90 + struct device_list { 98 91 /* Counter to assign interrupt numbers. */ 99 92 unsigned int next_irq; 100 93 ··· 105 100 106 101 /* A single linked list of devices. */ 107 102 struct device *dev; 108 - /* And a pointer to the last device for easy append and also for 109 - * configuration appending. */ 103 + /* And a pointer to the last device for easy append. */ 110 104 struct device *lastdev; 111 105 }; 112 106 ··· 113 109 static struct device_list devices; 114 110 115 111 /* The device structure describes a single device. */ 116 - struct device 117 - { 112 + struct device { 118 113 /* The linked-list pointer. */ 119 114 struct device *next; 120 115 ··· 138 135 }; 139 136 140 137 /* The virtqueue structure describes a queue attached to a device. */ 141 - struct virtqueue 142 - { 138 + struct virtqueue { 143 139 struct virtqueue *next; 144 140 145 141 /* Which device owns me. */ ··· 170 168 /* The original tty settings to restore on exit. */ 171 169 static struct termios orig_term; 172 170 173 - /* We have to be careful with barriers: our devices are all run in separate 171 + /* 172 + * We have to be careful with barriers: our devices are all run in separate 174 173 * threads and so we need to make sure that changes visible to the Guest happen 175 - * in precise order. */ 174 + * in precise order. 175 + */ 176 176 #define wmb() __asm__ __volatile__("" : : : "memory") 177 177 #define mb() __asm__ __volatile__("" : : : "memory") 178 178 179 - /* Convert an iovec element to the given type. 179 + /* 180 + * Convert an iovec element to the given type. 180 181 * 181 182 * This is a fairly ugly trick: we need to know the size of the type and 182 183 * alignment requirement to check the pointer is kosher. It's also nice to 183 184 * have the name of the type in case we report failure. 
184 185 * 185 186 * Typing those three things all the time is cumbersome and error prone, so we 186 - * have a macro which sets them all up and passes to the real function. */ 187 + * have a macro which sets them all up and passes to the real function. 188 + */ 187 189 #define convert(iov, type) \ 188 190 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) 189 191 ··· 204 198 /* Wrapper for the last available index. Makes it easier to change. */ 205 199 #define lg_last_avail(vq) ((vq)->last_avail_idx) 206 200 207 - /* The virtio configuration space is defined to be little-endian. x86 is 208 - * little-endian too, but it's nice to be explicit so we have these helpers. */ 201 + /* 202 + * The virtio configuration space is defined to be little-endian. x86 is 203 + * little-endian too, but it's nice to be explicit so we have these helpers. 204 + */ 209 205 #define cpu_to_le16(v16) (v16) 210 206 #define cpu_to_le32(v32) (v32) 211 207 #define cpu_to_le64(v64) (v64) ··· 249 241 + dev->num_vq * sizeof(struct lguest_vqconfig); 250 242 } 251 243 252 - /*L:100 The Launcher code itself takes us out into userspace, that scary place 253 - * where pointers run wild and free! Unfortunately, like most userspace 254 - * programs, it's quite boring (which is why everyone likes to hack on the 255 - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it 256 - * will get you through this section. Or, maybe not. 244 + /*L:100 245 + * The Launcher code itself takes us out into userspace, that scary place where 246 + * pointers run wild and free! Unfortunately, like most userspace programs, 247 + * it's quite boring (which is why everyone likes to hack on the kernel!). 248 + * Perhaps if you make up an Lguest Drinking Game at this point, it will get 249 + * you through this section. Or, maybe not. 257 250 * 258 251 * The Launcher sets up a big chunk of memory to be the Guest's "physical" 259 252 * memory and stores it in "guest_base". 
In other words, Guest physical == ··· 262 253 * 263 254 * This can be tough to get your head around, but usually it just means that we 264 255 * use these trivial conversion functions when the Guest gives us it's 265 - * "physical" addresses: */ 256 + * "physical" addresses: 257 + */ 266 258 static void *from_guest_phys(unsigned long addr) 267 259 { 268 260 return guest_base + addr; ··· 278 268 * Loading the Kernel. 279 269 * 280 270 * We start with couple of simple helper routines. open_or_die() avoids 281 - * error-checking code cluttering the callers: */ 271 + * error-checking code cluttering the callers: 272 + */ 282 273 static int open_or_die(const char *name, int flags) 283 274 { 284 275 int fd = open(name, flags); ··· 294 283 int fd = open_or_die("/dev/zero", O_RDONLY); 295 284 void *addr; 296 285 297 - /* We use a private mapping (ie. if we write to the page, it will be 298 - * copied). */ 286 + /* 287 + * We use a private mapping (ie. if we write to the page, it will be 288 + * copied). 289 + */ 299 290 addr = mmap(NULL, getpagesize() * num, 300 291 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 301 292 if (addr == MAP_FAILED) 302 293 err(1, "Mmaping %u pages of /dev/zero", num); 294 + 295 + /* 296 + * One neat mmap feature is that you can close the fd, and it 297 + * stays mapped. 298 + */ 303 299 close(fd); 304 300 305 301 return addr; ··· 323 305 return addr; 324 306 } 325 307 326 - /* This routine is used to load the kernel or initrd. It tries mmap, but if 308 + /* 309 + * This routine is used to load the kernel or initrd. It tries mmap, but if 327 310 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 328 - * it falls back to reading the memory in. */ 311 + * it falls back to reading the memory in. 312 + */ 329 313 static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 330 314 { 331 315 ssize_t r; 332 316 333 - /* We map writable even though for some segments are marked read-only. 
317 + /* 318 + * We map writable even though for some segments are marked read-only. 334 319 * The kernel really wants to be writable: it patches its own 335 320 * instructions. 336 321 * 337 322 * MAP_PRIVATE means that the page won't be copied until a write is 338 323 * done to it. This allows us to share untouched memory between 339 - * Guests. */ 324 + * Guests. 325 + */ 340 326 if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, 341 327 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 342 328 return; ··· 351 329 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 352 330 } 353 331 354 - /* This routine takes an open vmlinux image, which is in ELF, and maps it into 332 + /* 333 + * This routine takes an open vmlinux image, which is in ELF, and maps it into 355 334 * the Guest memory. ELF = Embedded Linking Format, which is the format used 356 335 * by all modern binaries on Linux including the kernel. 357 336 * ··· 360 337 * address. We use the physical address; the Guest will map itself to the 361 338 * virtual address. 362 339 * 363 - * We return the starting address. */ 340 + * We return the starting address. 341 + */ 364 342 static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 365 343 { 366 344 Elf32_Phdr phdr[ehdr->e_phnum]; 367 345 unsigned int i; 368 346 369 - /* Sanity checks on the main ELF header: an x86 executable with a 370 - * reasonable number of correctly-sized program headers. */ 347 + /* 348 + * Sanity checks on the main ELF header: an x86 executable with a 349 + * reasonable number of correctly-sized program headers. 
350 + */ 371 351 if (ehdr->e_type != ET_EXEC 372 352 || ehdr->e_machine != EM_386 373 353 || ehdr->e_phentsize != sizeof(Elf32_Phdr) 374 354 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 375 355 errx(1, "Malformed elf header"); 376 356 377 - /* An ELF executable contains an ELF header and a number of "program" 357 + /* 358 + * An ELF executable contains an ELF header and a number of "program" 378 359 * headers which indicate which parts ("segments") of the program to 379 - * load where. */ 360 + * load where. 361 + */ 380 362 381 363 /* We read in all the program headers at once: */ 382 364 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) ··· 389 361 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 390 362 err(1, "Reading program headers"); 391 363 392 - /* Try all the headers: there are usually only three. A read-only one, 393 - * a read-write one, and a "note" section which we don't load. */ 364 + /* 365 + * Try all the headers: there are usually only three. A read-only one, 366 + * a read-write one, and a "note" section which we don't load. 367 + */ 394 368 for (i = 0; i < ehdr->e_phnum; i++) { 395 369 /* If this isn't a loadable segment, we ignore it */ 396 370 if (phdr[i].p_type != PT_LOAD) ··· 410 380 return ehdr->e_entry; 411 381 } 412 382 413 - /*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're 414 - * supposed to jump into it and it will unpack itself. We used to have to 415 - * perform some hairy magic because the unpacking code scared me. 383 + /*L:150 384 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed 385 + * to jump into it and it will unpack itself. We used to have to perform some 386 + * hairy magic because the unpacking code scared me. 
416 387 * 417 388 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 418 389 * a small patch to jump over the tricky bits in the Guest, so now we just read 419 - * the funky header so we know where in the file to load, and away we go! */ 390 + * the funky header so we know where in the file to load, and away we go! 391 + */ 420 392 static unsigned long load_bzimage(int fd) 421 393 { 422 394 struct boot_params boot; ··· 426 394 /* Modern bzImages get loaded at 1M. */ 427 395 void *p = from_guest_phys(0x100000); 428 396 429 - /* Go back to the start of the file and read the header. It should be 430 - * a Linux boot header (see Documentation/x86/i386/boot.txt) */ 397 + /* 398 + * Go back to the start of the file and read the header. It should be 399 + * a Linux boot header (see Documentation/x86/i386/boot.txt) 400 + */ 431 401 lseek(fd, 0, SEEK_SET); 432 402 read(fd, &boot, sizeof(boot)); 433 403 ··· 448 414 return boot.hdr.code32_start; 449 415 } 450 416 451 - /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 417 + /*L:140 418 + * Loading the kernel is easy when it's a "vmlinux", but most kernels 452 419 * come wrapped up in the self-decompressing "bzImage" format. With a little 453 - * work, we can load those, too. */ 420 + * work, we can load those, too. 421 + */ 454 422 static unsigned long load_kernel(int fd) 455 423 { 456 424 Elf32_Ehdr hdr; ··· 469 433 return load_bzimage(fd); 470 434 } 471 435 472 - /* This is a trivial little helper to align pages. Andi Kleen hated it because 436 + /* 437 + * This is a trivial little helper to align pages. Andi Kleen hated it because 473 438 * it calls getpagesize() twice: "it's dumb code." 474 439 * 475 440 * Kernel guys get really het up about optimization, even when it's not 476 - * necessary. I leave this code as a reaction against that. */ 441 + * necessary. I leave this code as a reaction against that. 
442 + */ 477 443 static inline unsigned long page_align(unsigned long addr) 478 444 { 479 445 /* Add upwards and truncate downwards. */ 480 446 return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 481 447 } 482 448 483 - /*L:180 An "initial ram disk" is a disk image loaded into memory along with 484 - * the kernel which the kernel can use to boot from without needing any 485 - * drivers. Most distributions now use this as standard: the initrd contains 486 - * the code to load the appropriate driver modules for the current machine. 449 + /*L:180 450 + * An "initial ram disk" is a disk image loaded into memory along with the 451 + * kernel which the kernel can use to boot from without needing any drivers. 452 + * Most distributions now use this as standard: the initrd contains the code to 453 + * load the appropriate driver modules for the current machine. 487 454 * 488 455 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 489 - * kernels. He sent me this (and tells me when I break it). */ 456 + * kernels. He sent me this (and tells me when I break it). 457 + */ 490 458 static unsigned long load_initrd(const char *name, unsigned long mem) 491 459 { 492 460 int ifd; ··· 502 462 if (fstat(ifd, &st) < 0) 503 463 err(1, "fstat() on initrd '%s'", name); 504 464 505 - /* We map the initrd at the top of memory, but mmap wants it to be 506 - * page-aligned, so we round the size up for that. */ 465 + /* 466 + * We map the initrd at the top of memory, but mmap wants it to be 467 + * page-aligned, so we round the size up for that. 468 + */ 507 469 len = page_align(st.st_size); 508 470 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 509 - /* Once a file is mapped, you can close the file descriptor. It's a 510 - * little odd, but quite useful. */ 471 + /* 472 + * Once a file is mapped, you can close the file descriptor. It's a 473 + * little odd, but quite useful. 
474 + */ 511 475 close(ifd); 512 476 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 513 477 ··· 520 476 } 521 477 /*:*/ 522 478 523 - /* Simple routine to roll all the commandline arguments together with spaces 524 - * between them. */ 479 + /* 480 + * Simple routine to roll all the commandline arguments together with spaces 481 + * between them. 482 + */ 525 483 static void concat(char *dst, char *args[]) 526 484 { 527 485 unsigned int i, len = 0; ··· 540 494 dst[len] = '\0'; 541 495 } 542 496 543 - /*L:185 This is where we actually tell the kernel to initialize the Guest. We 497 + /*L:185 498 + * This is where we actually tell the kernel to initialize the Guest. We 544 499 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 545 500 * the base of Guest "physical" memory, the top physical page to allow and the 546 - * entry point for the Guest. */ 501 + * entry point for the Guest. 502 + */ 547 503 static void tell_kernel(unsigned long start) 548 504 { 549 505 unsigned long args[] = { LHREQ_INITIALIZE, ··· 559 511 } 560 512 /*:*/ 561 513 562 - /* 514 + /*L:200 563 515 * Device Handling. 564 516 * 565 517 * When the Guest gives us a buffer, it sends an array of addresses and sizes. ··· 570 522 static void *_check_pointer(unsigned long addr, unsigned int size, 571 523 unsigned int line) 572 524 { 573 - /* We have to separately check addr and addr+size, because size could 574 - * be huge and addr + size might wrap around. */ 525 + /* 526 + * We have to separately check addr and addr+size, because size could 527 + * be huge and addr + size might wrap around. 528 + */ 575 529 if (addr >= guest_limit || addr + size >= guest_limit) 576 530 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 577 - /* We return a pointer for the caller's convenience, now we know it's 578 - * safe to use. */ 531 + /* 532 + * We return a pointer for the caller's convenience, now we know it's 533 + * safe to use. 
534 + */ 579 535 return from_guest_phys(addr); 580 536 } 581 537 /* A macro which transparently hands the line number to the real function. */ 582 538 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 583 539 584 - /* Each buffer in the virtqueues is actually a chain of descriptors. This 540 + /* 541 + * Each buffer in the virtqueues is actually a chain of descriptors. This 585 542 * function returns the next descriptor in the chain, or vq->vring.num if we're 586 - * at the end. */ 543 + * at the end. 544 + */ 587 545 static unsigned next_desc(struct vring_desc *desc, 588 546 unsigned int i, unsigned int max) 589 547 { ··· 610 556 return next; 611 557 } 612 558 613 - /* This actually sends the interrupt for this virtqueue */ 559 + /* 560 + * This actually sends the interrupt for this virtqueue, if we've used a 561 + * buffer. 562 + */ 614 563 static void trigger_irq(struct virtqueue *vq) 615 564 { 616 565 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; ··· 633 576 err(1, "Triggering irq %i", vq->config.irq); 634 577 } 635 578 636 - /* This looks in the virtqueue and for the first available buffer, and converts 579 + /* 580 + * This looks in the virtqueue for the first available buffer, and converts 637 581 * it to an iovec for convenient access. Since descriptors consist of some 638 582 * number of output then some number of input descriptors, it's actually two 639 583 * iovecs, but we pack them into one and note how many of each there were. 640 584 * 641 - * This function returns the descriptor number found. */ 585 + * This function waits if necessary, and returns the descriptor number found. 586 + */ 642 587 static unsigned wait_for_vq_desc(struct virtqueue *vq, 643 588 struct iovec iov[], 644 589 unsigned int *out_num, unsigned int *in_num) ··· 649 590 struct vring_desc *desc; 650 591 u16 last_avail = lg_last_avail(vq); 651 592 593 + /* There's nothing available? 
*/ 652 594 while (last_avail == vq->vring.avail->idx) { 653 595 u64 event; 654 596 655 - /* OK, tell Guest about progress up to now. */ 597 + /* 598 + * Since we're about to sleep, now is a good time to tell the 599 + * Guest about what we've used up to now. 600 + */ 656 601 trigger_irq(vq); 657 602 658 603 /* OK, now we need to know about added descriptors. */ 659 604 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 660 605 661 - /* They could have slipped one in as we were doing that: make 662 - * sure it's written, then check again. */ 606 + /* 607 + * They could have slipped one in as we were doing that: make 608 + * sure it's written, then check again. 609 + */ 663 610 mb(); 664 611 if (last_avail != vq->vring.avail->idx) { 665 612 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; ··· 685 620 errx(1, "Guest moved used index from %u to %u", 686 621 last_avail, vq->vring.avail->idx); 687 622 688 - /* Grab the next descriptor number they're advertising, and increment 689 - * the index we've seen. */ 623 + /* 624 + * Grab the next descriptor number they're advertising, and increment 625 + * the index we've seen. 626 + */ 690 627 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 691 628 lg_last_avail(vq)++; 692 629 ··· 703 636 desc = vq->vring.desc; 704 637 i = head; 705 638 706 - /* If this is an indirect entry, then this buffer contains a descriptor 707 - * table which we handle as if it's any normal descriptor chain. */ 639 + /* 640 + * If this is an indirect entry, then this buffer contains a descriptor 641 + * table which we handle as if it's any normal descriptor chain. 642 + */ 708 643 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 709 644 if (desc[i].len % sizeof(struct vring_desc)) 710 645 errx(1, "Invalid size for indirect buffer table"); ··· 725 656 if (desc[i].flags & VRING_DESC_F_WRITE) 726 657 (*in_num)++; 727 658 else { 728 - /* If it's an output descriptor, they're all supposed 729 - * to come before any input descriptors. 
*/ 659 + /* 660 + * If it's an output descriptor, they're all supposed 661 + * to come before any input descriptors. 662 + */ 730 663 if (*in_num) 731 664 errx(1, "Descriptor has out after in"); 732 665 (*out_num)++; ··· 742 671 return head; 743 672 } 744 673 745 - /* After we've used one of their buffers, we tell them about it. We'll then 746 - * want to send them an interrupt, using trigger_irq(). */ 674 + /* 675 + * After we've used one of their buffers, we tell the Guest about it. Sometime 676 + * later we'll want to send them an interrupt using trigger_irq(); note that 677 + * wait_for_vq_desc() does that for us if it has to wait. 678 + */ 747 679 static void add_used(struct virtqueue *vq, unsigned int head, int len) 748 680 { 749 681 struct vring_used_elem *used; 750 682 751 - /* The virtqueue contains a ring of used buffers. Get a pointer to the 752 - * next entry in that used ring. */ 683 + /* 684 + * The virtqueue contains a ring of used buffers. Get a pointer to the 685 + * next entry in that used ring. 686 + */ 753 687 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 754 688 used->id = head; 755 689 used->len = len; ··· 774 698 /* 775 699 * The Console 776 700 * 777 - * We associate some data with the console for our exit hack. */ 778 - struct console_abort 779 - { 701 + * We associate some data with the console for our exit hack. 702 + */ 703 + struct console_abort { 780 704 /* How many times have they hit ^C? */ 781 705 int count; 782 706 /* When did they start? */ ··· 791 715 struct console_abort *abort = vq->dev->priv; 792 716 struct iovec iov[vq->vring.num]; 793 717 794 - /* Make sure there's a descriptor waiting. */ 718 + /* Make sure there's a descriptor available. */ 795 719 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 796 720 if (out_num) 797 721 errx(1, "Output buffers in console in queue?"); 798 722 799 - /* Read it in. */ 723 + /* Read into it. This is where we usually wait. 
*/ 800 724 len = readv(STDIN_FILENO, iov, in_num); 801 725 if (len <= 0) { 802 726 /* Ran out of input? */ 803 727 warnx("Failed to get console input, ignoring console."); 804 - /* For simplicity, dying threads kill the whole Launcher. So 805 - * just nap here. */ 728 + /* 729 + * For simplicity, dying threads kill the whole Launcher. So 730 + * just nap here. 731 + */ 806 732 for (;;) 807 733 pause(); 808 734 } 809 735 736 + /* Tell the Guest we used a buffer. */ 810 737 add_used_and_trigger(vq, head, len); 811 738 812 - /* Three ^C within one second? Exit. 739 + /* 740 + * Three ^C within one second? Exit. 813 741 * 814 742 * This is such a hack, but works surprisingly well. Each ^C has to 815 743 * be in a buffer by itself, so they can't be too fast. But we check 816 744 * that we get three within about a second, so they can't be too 817 - * slow. */ 745 + * slow. 746 + */ 818 747 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 819 748 abort->count = 0; 820 749 return; ··· 844 763 unsigned int head, out, in; 845 764 struct iovec iov[vq->vring.num]; 846 765 766 + /* We usually wait in here, for the Guest to give us something. */ 847 767 head = wait_for_vq_desc(vq, iov, &out, &in); 848 768 if (in) 849 769 errx(1, "Input buffers in console output queue?"); 770 + 771 + /* writev can return a partial write, so we loop here. */ 850 772 while (!iov_empty(iov, out)) { 851 773 int len = writev(STDOUT_FILENO, iov, out); 852 774 if (len <= 0) 853 775 err(1, "Write to stdout gave %i", len); 854 776 iov_consume(iov, out, len); 855 777 } 778 + 779 + /* 780 + * We're finished with that buffer: if we're going to sleep, 781 + * wait_for_vq_desc() will prod the Guest with an interrupt. 782 + */ 856 783 add_used(vq, head, 0); 857 784 } 858 785 ··· 880 791 unsigned int head, out, in; 881 792 struct iovec iov[vq->vring.num]; 882 793 794 + /* We usually wait in here for the Guest to give us a packet. 
*/ 883 795 head = wait_for_vq_desc(vq, iov, &out, &in); 884 796 if (in) 885 797 errx(1, "Input buffers in net output queue?"); 798 + /* 799 + * Send the whole thing through to /dev/net/tun. It expects the exact 800 + * same format: what a coincidence! 801 + */ 886 802 if (writev(net_info->tunfd, iov, out) < 0) 887 803 errx(1, "Write to tun failed?"); 804 + 805 + /* 806 + * Done with that one; wait_for_vq_desc() will send the interrupt if 807 + * all packets are processed. 808 + */ 888 809 add_used(vq, head, 0); 889 810 } 890 811 891 - /* Will reading from this file descriptor block? */ 812 + /* 813 + * Handling network input is a bit trickier, because I've tried to optimize it. 814 + * 815 + * First we have a helper routine which tells is if from this file descriptor 816 + * (ie. the /dev/net/tun device) will block: 817 + */ 892 818 static bool will_block(int fd) 893 819 { 894 820 fd_set fdset; ··· 913 809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 914 810 } 915 811 916 - /* This is where we handle packets coming in from the tun device to our 917 - * Guest. */ 812 + /* 813 + * This handles packets coming in from the tun device to our Guest. Like all 814 + * service routines, it gets called again as soon as it returns, so you don't 815 + * see a while(1) loop here. 816 + */ 918 817 static void net_input(struct virtqueue *vq) 919 818 { 920 819 int len; ··· 925 818 struct iovec iov[vq->vring.num]; 926 819 struct net_info *net_info = vq->dev->priv; 927 820 821 + /* 822 + * Get a descriptor to write an incoming packet into. This will also 823 + * send an interrupt if they're out of descriptors. 824 + */ 928 825 head = wait_for_vq_desc(vq, iov, &out, &in); 929 826 if (out) 930 827 errx(1, "Output buffers in net input queue?"); 931 828 932 - /* Deliver interrupt now, since we're about to sleep. */ 829 + /* 830 + * If it looks like we'll block reading from the tun device, send them 831 + * an interrupt. 
832 + */ 933 833 if (vq->pending_used && will_block(net_info->tunfd)) 934 834 trigger_irq(vq); 935 835 836 + /* 837 + * Read in the packet. This is where we normally wait (when there's no 838 + * incoming network traffic). 839 + */ 936 840 len = readv(net_info->tunfd, iov, in); 937 841 if (len <= 0) 938 842 err(1, "Failed to read from tun."); 843 + 844 + /* 845 + * Mark that packet buffer as used, but don't interrupt here. We want 846 + * to wait until we've done as much work as we can. 847 + */ 939 848 add_used(vq, head, len); 940 849 } 850 + /*:*/ 941 851 942 - /* This is the helper to create threads. */ 852 + /* This is the helper to create threads: run the service routine in a loop. */ 943 853 static int do_thread(void *_vq) 944 854 { 945 855 struct virtqueue *vq = _vq; ··· 966 842 return 0; 967 843 } 968 844 969 - /* When a child dies, we kill our entire process group with SIGTERM. This 970 - * also has the side effect that the shell restores the console for us! */ 845 + /* 846 + * When a child dies, we kill our entire process group with SIGTERM. This 847 + * also has the side effect that the shell restores the console for us! 848 + */ 971 849 static void kill_launcher(int signal) 972 850 { 973 851 kill(0, SIGTERM); ··· 1004 878 signal(SIGCHLD, (void *)kill_launcher); 1005 879 } 1006 880 881 + /*L:216 882 + * This actually creates the thread which services the virtqueue for a device. 883 + */ 1007 884 static void create_thread(struct virtqueue *vq) 1008 885 { 1009 - /* Create stack for thread and run it. Since stack grows 1010 - * upwards, we point the stack pointer to the end of this 1011 - * region. */ 886 + /* 887 + * Create stack for thread. Since the stack grows upwards, we point 888 + * the stack pointer to the end of this region. 
889 + */ 1012 890 char *stack = malloc(32768); 1013 891 unsigned long args[] = { LHREQ_EVENTFD, 1014 892 vq->config.pfn*getpagesize(), 0 }; ··· 1023 893 err(1, "Creating eventfd"); 1024 894 args[2] = vq->eventfd; 1025 895 1026 - /* Attach an eventfd to this virtqueue: it will go off 1027 - * when the Guest does an LHCALL_NOTIFY for this vq. */ 896 + /* 897 + * Attach an eventfd to this virtqueue: it will go off when the Guest 898 + * does an LHCALL_NOTIFY for this vq. 899 + */ 1028 900 if (write(lguest_fd, &args, sizeof(args)) != 0) 1029 901 err(1, "Attaching eventfd"); 1030 902 1031 - /* CLONE_VM: because it has to access the Guest memory, and 1032 - * SIGCHLD so we get a signal if it dies. */ 903 + /* 904 + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 905 + * we get a signal if it dies. 906 + */ 1033 907 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1034 908 if (vq->thread == (pid_t)-1) 1035 909 err(1, "Creating clone"); 1036 - /* We close our local copy, now the child has it. */ 910 + 911 + /* We close our local copy now the child has it. */ 1037 912 close(vq->eventfd); 1038 913 } 1039 914 ··· 1090 955 } 1091 956 } 1092 957 1093 - /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 958 + /*L:215 959 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In 960 + * particular, it's used to notify us of device status changes during boot. 961 + */ 1094 962 static void handle_output(unsigned long addr) 1095 963 { 1096 964 struct device *i; ··· 1102 964 for (i = devices.dev; i; i = i->next) { 1103 965 struct virtqueue *vq; 1104 966 1105 - /* Notifications to device descriptors update device status. */ 967 + /* 968 + * Notifications to device descriptors mean they updated the 969 + * device status. 
970 + */ 1106 971 if (from_guest_phys(addr) == i->desc) { 1107 972 update_device_status(i); 1108 973 return; 1109 974 } 1110 975 1111 - /* Devices *can* be used before status is set to DRIVER_OK. */ 976 + /* 977 + * Devices *can* be used before status is set to DRIVER_OK. 978 + * The original plan was that they would never do this: they 979 + * would always finish setting up their status bits before 980 + * actually touching the virtqueues. In practice, we allowed 981 + * them to, and they do (eg. the disk probes for partition 982 + * tables as part of initialization). 983 + * 984 + * If we see this, we start the device: once it's running, we 985 + * expect the device to catch all the notifications. 986 + */ 1112 987 for (vq = i->vq; vq; vq = vq->next) { 1113 988 if (addr != vq->config.pfn*getpagesize()) 1114 989 continue; 1115 990 if (i->running) 1116 991 errx(1, "Notification on running %s", i->name); 992 + /* This just calls create_thread() for each virtqueue */ 1117 993 start_device(i); 1118 994 return; 1119 995 } 1120 996 } 1121 997 1122 - /* Early console write is done using notify on a nul-terminated string 1123 - * in Guest memory. */ 998 + /* 999 + * Early console write is done using notify on a nul-terminated string 1000 + * in Guest memory. It's also great for hacking debugging messages 1001 + * into a Guest. 1002 + */ 1124 1003 if (addr >= guest_limit) 1125 1004 errx(1, "Bad NOTIFY %#lx", addr); 1126 1005 ··· 1153 998 * routines to allocate and manage them. 1154 999 */ 1155 1000 1156 - /* The layout of the device page is a "struct lguest_device_desc" followed by a 1001 + /* 1002 + * The layout of the device page is a "struct lguest_device_desc" followed by a 1157 1003 * number of virtqueue descriptors, then two sets of feature bits, then an 1158 1004 * array of configuration bytes. This routine returns the configuration 1159 - * pointer. */ 1005 + * pointer. 
1006 + */ 1160 1007 static u8 *device_config(const struct device *dev) 1161 1008 { 1162 1009 return (void *)(dev->desc + 1) ··· 1166 1009 + dev->feature_len * 2; 1167 1010 } 1168 1011 1169 - /* This routine allocates a new "struct lguest_device_desc" from descriptor 1012 + /* 1013 + * This routine allocates a new "struct lguest_device_desc" from descriptor 1170 1014 * table page just above the Guest's normal memory. It returns a pointer to 1171 - * that descriptor. */ 1015 + * that descriptor. 1016 + */ 1172 1017 static struct lguest_device_desc *new_dev_desc(u16 type) 1173 1018 { 1174 1019 struct lguest_device_desc d = { .type = type }; ··· 1191 1032 return memcpy(p, &d, sizeof(d)); 1192 1033 } 1193 1034 1194 - /* Each device descriptor is followed by the description of its virtqueues. We 1195 - * specify how many descriptors the virtqueue is to have. */ 1035 + /* 1036 + * Each device descriptor is followed by the description of its virtqueues. We 1037 + * specify how many descriptors the virtqueue is to have. 1038 + */ 1196 1039 static void add_virtqueue(struct device *dev, unsigned int num_descs, 1197 1040 void (*service)(struct virtqueue *)) 1198 1041 { ··· 1211 1050 vq->next = NULL; 1212 1051 vq->last_avail_idx = 0; 1213 1052 vq->dev = dev; 1053 + 1054 + /* 1055 + * This is the routine the service thread will run, and its Process ID 1056 + * once it's running. 1057 + */ 1214 1058 vq->service = service; 1215 1059 vq->thread = (pid_t)-1; 1216 1060 ··· 1227 1061 /* Initialize the vring. */ 1228 1062 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); 1229 1063 1230 - /* Append virtqueue to this device's descriptor. We use 1064 + /* 1065 + * Append virtqueue to this device's descriptor. We use 1231 1066 * device_config() to get the end of the device's current virtqueues; 1232 1067 * we check that we haven't added any config or feature information 1233 - * yet, otherwise we'd be overwriting them. */ 1068 + * yet, otherwise we'd be overwriting them. 
1069 + */ 1234 1070 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1235 1071 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1236 1072 dev->num_vq++; ··· 1240 1072 1241 1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1242 1074 1243 - /* Add to tail of list, so dev->vq is first vq, dev->vq->next is 1244 - * second. */ 1075 + /* 1076 + * Add to tail of list, so dev->vq is first vq, dev->vq->next is 1077 + * second. 1078 + */ 1245 1079 for (i = &dev->vq; *i; i = &(*i)->next); 1246 1080 *i = vq; 1247 1081 } 1248 1082 1249 - /* The first half of the feature bitmask is for us to advertise features. The 1250 - * second half is for the Guest to accept features. */ 1083 + /* 1084 + * The first half of the feature bitmask is for us to advertise features. The 1085 + * second half is for the Guest to accept features. 1086 + */ 1251 1087 static void add_feature(struct device *dev, unsigned bit) 1252 1088 { 1253 1089 u8 *features = get_feature_bits(dev); ··· 1265 1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1266 1094 } 1267 1095 1268 - /* This routine sets the configuration fields for an existing device's 1096 + /* 1097 + * This routine sets the configuration fields for an existing device's 1269 1098 * descriptor. It only works for the last device, but that's OK because that's 1270 - * how we use it. */ 1099 + * how we use it. 1100 + */ 1271 1101 static void set_config(struct device *dev, unsigned len, const void *conf) 1272 1102 { 1273 1103 /* Check we haven't overflowed our single page. */ ··· 1279 1105 /* Copy in the config information, and store the length. */ 1280 1106 memcpy(device_config(dev), conf, len); 1281 1107 dev->desc->config_len = len; 1108 + 1109 + /* Size must fit in config_len field (8 bits)! 
*/ 1110 + assert(dev->desc->config_len == len); 1282 1111 } 1283 1112 1284 - /* This routine does all the creation and setup of a new device, including 1285 - * calling new_dev_desc() to allocate the descriptor and device memory. 1113 + /* 1114 + * This routine does all the creation and setup of a new device, including 1115 + * calling new_dev_desc() to allocate the descriptor and device memory. We 1116 + * don't actually start the service threads until later. 1286 1117 * 1287 - * See what I mean about userspace being boring? */ 1118 + * See what I mean about userspace being boring? 1119 + */ 1288 1120 static struct device *new_device(const char *name, u16 type) 1289 1121 { 1290 1122 struct device *dev = malloc(sizeof(*dev)); ··· 1303 1123 dev->num_vq = 0; 1304 1124 dev->running = false; 1305 1125 1306 - /* Append to device list. Prepending to a single-linked list is 1126 + /* 1127 + * Append to device list. Prepending to a single-linked list is 1307 1128 * easier, but the user expects the devices to be arranged on the bus 1308 1129 * in command-line order. The first network device on the command line 1309 - * is eth0, the first block device /dev/vda, etc. */ 1130 + * is eth0, the first block device /dev/vda, etc. 1131 + */ 1310 1132 if (devices.lastdev) 1311 1133 devices.lastdev->next = dev; 1312 1134 else ··· 1318 1136 return dev; 1319 1137 } 1320 1138 1321 - /* Our first setup routine is the console. It's a fairly simple device, but 1322 - * UNIX tty handling makes it uglier than it could be. */ 1139 + /* 1140 + * Our first setup routine is the console. It's a fairly simple device, but 1141 + * UNIX tty handling makes it uglier than it could be. 1142 + */ 1323 1143 static void setup_console(void) 1324 1144 { 1325 1145 struct device *dev; ··· 1329 1145 /* If we can save the initial standard input settings... 
*/ 1330 1146 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 1331 1147 struct termios term = orig_term; 1332 - /* Then we turn off echo, line buffering and ^C etc. We want a 1333 - * raw input stream to the Guest. */ 1148 + /* 1149 + * Then we turn off echo, line buffering and ^C etc: We want a 1150 + * raw input stream to the Guest. 1151 + */ 1334 1152 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1335 1153 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1336 1154 } ··· 1343 1157 dev->priv = malloc(sizeof(struct console_abort)); 1344 1158 ((struct console_abort *)dev->priv)->count = 0; 1345 1159 1346 - /* The console needs two virtqueues: the input then the output. When 1160 + /* 1161 + * The console needs two virtqueues: the input then the output. When 1347 1162 * they put something in the input queue, we make sure we're listening to 1348 1163 * stdin. When they put something in the output queue, we write it to 1349 - * stdout. */ 1164 + * stdout. 1165 + */ 1350 1166 add_virtqueue(dev, VIRTQUEUE_NUM, console_input); 1351 1167 add_virtqueue(dev, VIRTQUEUE_NUM, console_output); 1352 1168 ··· 1356 1168 } 1357 1169 /*:*/ 1358 1170 1359 - /*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1171 + /*M:010 1172 + * Inter-guest networking is an interesting area. Simplest is to have a 1360 1173 * --sharenet=<name> option which opens or creates a named pipe. This can be 1361 1174 * used to send packets to another guest in a 1:1 manner. 1362 1175 * ··· 1371 1182 * multiple inter-guest channels behind one interface, although it would 1372 1183 * require some manner of hotplugging new virtio channels. 1373 1184 * 1374 - * Finally, we could implement a virtio network switch in the kernel. :*/ 1185 + * Finally, we could implement a virtio network switch in the kernel. 
1186 + :*/ 1375 1187 1376 1188 static u32 str2ip(const char *ipaddr) 1377 1189 { ··· 1397 1207 mac[5] = m[5]; 1398 1208 } 1399 1209 1400 - /* This code is "adapted" from libbridge: it attaches the Host end of the 1210 + /* 1211 + * This code is "adapted" from libbridge: it attaches the Host end of the 1401 1212 * network device to the bridge device specified by the command line. 1402 1213 * 1403 1214 * This is yet another James Morris contribution (I'm an IP-level guy, so I 1404 - * dislike bridging), and I just try not to break it. */ 1215 + * dislike bridging), and I just try not to break it. 1216 + */ 1405 1217 static void add_to_bridge(int fd, const char *if_name, const char *br_name) 1406 1218 { 1407 1219 int ifidx; ··· 1423 1231 err(1, "can't add %s to bridge %s", if_name, br_name); 1424 1232 } 1425 1233 1426 - /* This sets up the Host end of the network device with an IP address, brings 1234 + /* 1235 + * This sets up the Host end of the network device with an IP address, brings 1427 1236 * it up so packets will flow, then copies the MAC address into the hwaddr 1428 - * pointer. */ 1237 + * pointer. 1238 + */ 1429 1239 static void configure_device(int fd, const char *tapif, u32 ipaddr) 1430 1240 { 1431 1241 struct ifreq ifr; ··· 1454 1260 /* Start with this zeroed. Messy but sure. */ 1455 1261 memset(&ifr, 0, sizeof(ifr)); 1456 1262 1457 - /* We open the /dev/net/tun device and tell it we want a tap device. A 1263 + /* 1264 + * We open the /dev/net/tun device and tell it we want a tap device. A 1458 1265 * tap device is like a tun device, only somehow different. To tell 1459 1266 * the truth, I completely blundered my way through this code, but it 1460 - * works now! */ 1267 + * works now! 
1268 + */ 1461 1269 netfd = open_or_die("/dev/net/tun", O_RDWR); 1462 1270 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 1463 1271 strcpy(ifr.ifr_name, "tap%d"); ··· 1470 1274 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 1471 1275 err(1, "Could not set features for tun device"); 1472 1276 1473 - /* We don't need checksums calculated for packets coming in this 1474 - * device: trust us! */ 1277 + /* 1278 + * We don't need checksums calculated for packets coming in this 1279 + * device: trust us! 1280 + */ 1475 1281 ioctl(netfd, TUNSETNOCSUM, 1); 1476 1282 1477 1283 memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 1478 1284 return netfd; 1479 1285 } 1480 1286 1481 - /*L:195 Our network is a Host<->Guest network. This can either use bridging or 1287 + /*L:195 1288 + * Our network is a Host<->Guest network. This can either use bridging or 1482 1289 * routing, but the principle is the same: it uses the "tun" device to inject 1483 1290 * packets into the Host as if they came in from a normal network card. We 1484 - * just shunt packets between the Guest and the tun device. */ 1291 + * just shunt packets between the Guest and the tun device. 1292 + */ 1485 1293 static void setup_tun_net(char *arg) 1486 1294 { 1487 1295 struct device *dev; ··· 1502 1302 dev = new_device("net", VIRTIO_ID_NET); 1503 1303 dev->priv = net_info; 1504 1304 1505 - /* Network devices need a receive and a send queue, just like 1506 - * console. */ 1305 + /* Network devices need a recv and a send queue, just like console. */ 1507 1306 add_virtqueue(dev, VIRTQUEUE_NUM, net_input); 1508 1307 add_virtqueue(dev, VIRTQUEUE_NUM, net_output); 1509 1308 1510 - /* We need a socket to perform the magic network ioctls to bring up the 1511 - * tap interface, connect to the bridge etc. Any socket will do! */ 1309 + /* 1310 + * We need a socket to perform the magic network ioctls to bring up the 1311 + * tap interface, connect to the bridge etc. Any socket will do! 
1312 + */ 1512 1313 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 1513 1314 if (ipfd < 0) 1514 1315 err(1, "opening IP socket"); ··· 1563 1362 verbose("device %u: tun %s: %s\n", 1564 1363 devices.device_num, tapif, arg); 1565 1364 } 1566 - 1567 - /* Our block (disk) device should be really simple: the Guest asks for a block 1568 - * number and we read or write that position in the file. Unfortunately, that 1569 - * was amazingly slow: the Guest waits until the read is finished before 1570 - * running anything else, even if it could have been doing useful work. 1571 - * 1572 - * We could use async I/O, except it's reputed to suck so hard that characters 1573 - * actually go missing from your code when you try to use it. 1574 - * 1575 - * So we farm the I/O out to thread, and communicate with it via a pipe. */ 1365 + /*:*/ 1576 1366 1577 1367 /* This hangs off device->priv. */ 1578 - struct vblk_info 1579 - { 1368 + struct vblk_info { 1580 1369 /* The size of the file. */ 1581 1370 off64_t len; 1582 1371 1583 1372 /* The file descriptor for the file. */ 1584 1373 int fd; 1585 1374 1586 - /* IO thread listens on this file descriptor [0]. */ 1587 - int workpipe[2]; 1588 - 1589 - /* IO thread writes to this file descriptor to mark it done, then 1590 - * Launcher triggers interrupt to Guest. */ 1591 - int done_fd; 1592 1375 }; 1593 1376 1594 1377 /*L:210 1595 1378 * The Disk 1596 1379 * 1597 - * Remember that the block device is handled by a separate I/O thread. We head 1598 - * straight into the core of that thread here: 1380 + * The disk only has one virtqueue, so it only has one thread. It is really 1381 + * simple: the Guest asks for a block number and we read or write that position 1382 + * in the file. 1383 + * 1384 + * Before we serviced each virtqueue in a separate thread, that was unacceptably 1385 + * slow: the Guest waits until the read is finished before running anything 1386 + * else, even if it could have been doing useful work. 
1387 + * 1388 + * We could have used async I/O, except it's reputed to suck so hard that 1389 + * characters actually go missing from your code when you try to use it. 1599 1390 */ 1600 1391 static void blk_request(struct virtqueue *vq) 1601 1392 { ··· 1599 1406 struct iovec iov[vq->vring.num]; 1600 1407 off64_t off; 1601 1408 1602 - /* Get the next request. */ 1409 + /* 1410 + * Get the next request, where we normally wait. It triggers the 1411 + * interrupt to acknowledge previously serviced requests (if any). 1412 + */ 1603 1413 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1604 1414 1605 - /* Every block request should contain at least one output buffer 1415 + /* 1416 + * Every block request should contain at least one output buffer 1606 1417 * (detailing the location on disk and the type of request) and one 1607 - * input buffer (to hold the result). */ 1418 + * input buffer (to hold the result). 1419 + */ 1608 1420 if (out_num == 0 || in_num == 0) 1609 1421 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1610 1422 head, out_num, in_num); 1611 1423 1612 1424 out = convert(&iov[0], struct virtio_blk_outhdr); 1613 1425 in = convert(&iov[out_num+in_num-1], u8); 1426 + /* 1427 + * For historical reasons, block operations are expressed in 512 byte 1428 + * "sectors". 1429 + */ 1614 1430 off = out->sector * 512; 1615 1431 1616 - /* The block device implements "barriers", where the Guest indicates 1432 + /* 1433 + * The block device implements "barriers", where the Guest indicates 1617 1434 * that it wants all previous writes to occur before this write. We 1618 1435 * don't have a way of asking our kernel to do a barrier, so we just 1619 - * synchronize all the data in the file. Pretty poor, no? */ 1436 + * synchronize all the data in the file. Pretty poor, no? 1437 + */ 1620 1438 if (out->type & VIRTIO_BLK_T_BARRIER) 1621 1439 fdatasync(vblk->fd); 1622 1440 1623 - /* In general the virtio block driver is allowed to try SCSI commands. 
1624 - * It'd be nice if we supported eject, for example, but we don't. */ 1441 + /* 1442 + * In general the virtio block driver is allowed to try SCSI commands. 1443 + * It'd be nice if we supported eject, for example, but we don't. 1444 + */ 1625 1445 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1626 1446 fprintf(stderr, "Scsi commands unsupported\n"); 1627 1447 *in = VIRTIO_BLK_S_UNSUPP; 1628 1448 wlen = sizeof(*in); 1629 1449 } else if (out->type & VIRTIO_BLK_T_OUT) { 1630 - /* Write */ 1631 - 1632 - /* Move to the right location in the block file. This can fail 1633 - * if they try to write past end. */ 1450 + /* 1451 + * Write 1452 + * 1453 + * Move to the right location in the block file. This can fail 1454 + * if they try to write past end. 1455 + */ 1634 1456 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1635 1457 err(1, "Bad seek to sector %llu", out->sector); 1636 1458 1637 1459 ret = writev(vblk->fd, iov+1, out_num-1); 1638 1460 verbose("WRITE to sector %llu: %i\n", out->sector, ret); 1639 1461 1640 - /* Grr... Now we know how long the descriptor they sent was, we 1462 + /* 1463 + * Grr... Now we know how long the descriptor they sent was, we 1641 1464 * make sure they didn't try to write over the end of the block 1642 - * file (possibly extending it). */ 1465 + * file (possibly extending it). 1466 + */ 1643 1467 if (ret > 0 && off + ret > vblk->len) { 1644 1468 /* Trim it back to the correct length */ 1645 1469 ftruncate64(vblk->fd, vblk->len); ··· 1666 1456 wlen = sizeof(*in); 1667 1457 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1668 1458 } else { 1669 - /* Read */ 1670 - 1671 - /* Move to the right location in the block file. This can fail 1672 - * if they try to read past end. */ 1459 + /* 1460 + * Read 1461 + * 1462 + * Move to the right location in the block file. This can fail 1463 + * if they try to read past end. 
1464 + */ 1673 1465 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1674 1466 err(1, "Bad seek to sector %llu", out->sector); 1675 1467 ··· 1686 1474 } 1687 1475 } 1688 1476 1689 - /* OK, so we noted that it was pretty poor to use an fdatasync as a 1477 + /* 1478 + * OK, so we noted that it was pretty poor to use an fdatasync as a 1690 1479 * barrier. But Christoph Hellwig points out that we need a sync 1691 1480 * *afterwards* as well: "Barriers specify no reordering to the front 1692 - * or the back." And Jens Axboe confirmed it, so here we are: */ 1481 + * or the back." And Jens Axboe confirmed it, so here we are: 1482 + */ 1693 1483 if (out->type & VIRTIO_BLK_T_BARRIER) 1694 1484 fdatasync(vblk->fd); 1695 1485 1486 + /* Finished that request. */ 1696 1487 add_used(vq, head, wlen); 1697 1488 } 1698 1489 ··· 1706 1491 struct vblk_info *vblk; 1707 1492 struct virtio_blk_config conf; 1708 1493 1709 - /* The device responds to return from I/O thread. */ 1494 + /* Create the device. */ 1710 1495 dev = new_device("block", VIRTIO_ID_BLOCK); 1711 1496 1712 1497 /* The device has one virtqueue, where the Guest places requests. */ ··· 1725 1510 /* Tell Guest how many sectors this device has. */ 1726 1511 conf.capacity = cpu_to_le64(vblk->len / 512); 1727 1512 1728 - /* Tell Guest not to put in too many descriptors at once: two are used 1729 - * for the in and out elements. */ 1513 + /* 1514 + * Tell Guest not to put in too many descriptors at once: two are used 1515 + * for the in and out elements. 1516 + */ 1730 1517 add_feature(dev, VIRTIO_BLK_F_SEG_MAX); 1731 1518 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 1732 1519 1733 - set_config(dev, sizeof(conf), &conf); 1520 + /* Don't try to put whole struct: we have 8 bit limit. 
*/ 1521 + set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); 1734 1522 1735 1523 verbose("device %u: virtblock %llu sectors\n", 1736 1524 ++devices.device_num, le64_to_cpu(conf.capacity)); 1737 1525 } 1738 1526 1739 - struct rng_info { 1740 - int rfd; 1741 - }; 1742 - 1743 - /* Our random number generator device reads from /dev/random into the Guest's 1527 + /*L:211 1528 + * Our random number generator device reads from /dev/random into the Guest's 1744 1529 * input buffers. The usual case is that the Guest doesn't want random numbers 1745 1530 * and so has no buffers although /dev/random is still readable, whereas 1746 1531 * console is the reverse. 1747 1532 * 1748 - * The same logic applies, however. */ 1533 + * The same logic applies, however. 1534 + */ 1535 + struct rng_info { 1536 + int rfd; 1537 + }; 1538 + 1749 1539 static void rng_input(struct virtqueue *vq) 1750 1540 { 1751 1541 int len; ··· 1763 1543 if (out_num) 1764 1544 errx(1, "Output buffers in rng?"); 1765 1545 1766 - /* This is why we convert to iovecs: the readv() call uses them, and so 1767 - * it reads straight into the Guest's buffer. We loop to make sure we 1768 - * fill it. */ 1546 + /* 1547 + * Just like the console write, we loop to cover the whole iovec. 1548 + * In this case, short reads actually happen quite a bit. 1549 + */ 1769 1550 while (!iov_empty(iov, in_num)) { 1770 1551 len = readv(rng_info->rfd, iov, in_num); 1771 1552 if (len <= 0) ··· 1779 1558 add_used(vq, head, totlen); 1780 1559 } 1781 1560 1782 - /* And this creates a "hardware" random number device for the Guest. */ 1561 + /*L:199 1562 + * This creates a "hardware" random number device for the Guest. 1563 + */ 1783 1564 static void setup_rng(void) 1784 1565 { 1785 1566 struct device *dev; 1786 1567 struct rng_info *rng_info = malloc(sizeof(*rng_info)); 1787 1568 1569 + /* Our device's private info simply contains the /dev/random fd. 
*/ 1788 1570 rng_info->rfd = open_or_die("/dev/random", O_RDONLY); 1789 1571 1790 - /* The device responds to return from I/O thread. */ 1572 + /* Create the new device. */ 1791 1573 dev = new_device("rng", VIRTIO_ID_RNG); 1792 1574 dev->priv = rng_info; 1793 1575 ··· 1806 1582 { 1807 1583 unsigned int i; 1808 1584 1809 - /* Since we don't track all open fds, we simply close everything beyond 1810 - * stderr. */ 1585 + /* 1586 + * Since we don't track all open fds, we simply close everything beyond 1587 + * stderr. 1588 + */ 1811 1589 for (i = 3; i < FD_SETSIZE; i++) 1812 1590 close(i); 1813 1591 ··· 1820 1594 err(1, "Could not exec %s", main_args[0]); 1821 1595 } 1822 1596 1823 - /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1824 - * its input and output, and finally, lays it to rest. */ 1597 + /*L:220 1598 + * Finally we reach the core of the Launcher which runs the Guest, serves 1599 + * its input and output, and finally, lays it to rest. 1600 + */ 1825 1601 static void __attribute__((noreturn)) run_guest(void) 1826 1602 { 1827 1603 for (;;) { ··· 1858 1630 * 1859 1631 * Are you ready? Take a deep breath and join me in the core of the Host, in 1860 1632 * "make Host". 1861 - :*/ 1633 + :*/ 1862 1634 1863 1635 static struct option opts[] = { 1864 1636 { "verbose", 0, NULL, 'v' }, ··· 1879 1651 /*L:105 The main routine is where the real work begins: */ 1880 1652 int main(int argc, char *argv[]) 1881 1653 { 1882 - /* Memory, top-level pagetable, code startpoint and size of the 1883 - * (optional) initrd. */ 1654 + /* Memory, code startpoint and size of the (optional) initrd. */ 1884 1655 unsigned long mem = 0, start, initrd_size = 0; 1885 1656 /* Two temporaries. */ 1886 1657 int i, c; ··· 1891 1664 /* Save the args: we "reboot" by execing ourselves again. */ 1892 1665 main_args = argv; 1893 1666 1894 - /* First we initialize the device list. We keep a pointer to the last 1667 + /* 1668 + * First we initialize the device list. 
We keep a pointer to the last 1895 1669 * device, and the next interrupt number to use for devices (1: 1896 - * remember that 0 is used by the timer). */ 1670 + * remember that 0 is used by the timer). 1671 + */ 1897 1672 devices.lastdev = NULL; 1898 1673 devices.next_irq = 1; 1899 1674 1675 + /* We're CPU 0. In fact, that's the only CPU possible right now. */ 1900 1676 cpu_id = 0; 1901 - /* We need to know how much memory so we can set up the device 1677 + 1678 + /* 1679 + * We need to know how much memory so we can set up the device 1902 1680 * descriptor and memory pages for the devices as we parse the command 1903 1681 * line. So we quickly look through the arguments to find the amount 1904 - * of memory now. */ 1682 + * of memory now. 1683 + */ 1905 1684 for (i = 1; i < argc; i++) { 1906 1685 if (argv[i][0] != '-') { 1907 1686 mem = atoi(argv[i]) * 1024 * 1024; 1908 - /* We start by mapping anonymous pages over all of 1687 + /* 1688 + * We start by mapping anonymous pages over all of 1909 1689 * guest-physical memory range. This fills it with 0, 1910 1690 * and ensures that the Guest won't be killed when it 1911 - * tries to access it. */ 1691 + * tries to access it. 1692 + */ 1912 1693 guest_base = map_zeroed_pages(mem / getpagesize() 1913 1694 + DEVICE_PAGES); 1914 1695 guest_limit = mem; ··· 1949 1714 usage(); 1950 1715 } 1951 1716 } 1952 - /* After the other arguments we expect memory and kernel image name, 1953 - * followed by command line arguments for the kernel. */ 1717 + /* 1718 + * After the other arguments we expect memory and kernel image name, 1719 + * followed by command line arguments for the kernel. 
1720 + */ 1954 1721 if (optind + 2 > argc) 1955 1722 usage(); 1956 1723 ··· 1970 1733 /* Map the initrd image if requested (at top of physical memory) */ 1971 1734 if (initrd_name) { 1972 1735 initrd_size = load_initrd(initrd_name, mem); 1973 - /* These are the location in the Linux boot header where the 1974 - * start and size of the initrd are expected to be found. */ 1736 + /* 1737 + * These are the location in the Linux boot header where the 1738 + * start and size of the initrd are expected to be found. 1739 + */ 1975 1740 boot->hdr.ramdisk_image = mem - initrd_size; 1976 1741 boot->hdr.ramdisk_size = initrd_size; 1977 1742 /* The bootloader type 0xFF means "unknown"; that's OK. */ 1978 1743 boot->hdr.type_of_loader = 0xFF; 1979 1744 } 1980 1745 1981 - /* The Linux boot header contains an "E820" memory map: ours is a 1982 - * simple, single region. */ 1746 + /* 1747 + * The Linux boot header contains an "E820" memory map: ours is a 1748 + * simple, single region. 1749 + */ 1983 1750 boot->e820_entries = 1; 1984 1751 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 1985 - /* The boot header contains a command line pointer: we put the command 1986 - * line after the boot header. */ 1752 + /* 1753 + * The boot header contains a command line pointer: we put the command 1754 + * line after the boot header. 1755 + */ 1987 1756 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1988 1757 /* We use a simple helper to copy the arguments separated by spaces. */ 1989 1758 concat((char *)(boot + 1), argv+optind+2); ··· 2003 1760 /* Tell the entry path not to try to reload segment registers. */ 2004 1761 boot->hdr.loadflags |= KEEP_SEGMENTS; 2005 1762 2006 - /* We tell the kernel to initialize the Guest: this returns the open 2007 - * /dev/lguest file descriptor. */ 1763 + /* 1764 + * We tell the kernel to initialize the Guest: this returns the open 1765 + * /dev/lguest file descriptor. 
1766 + */ 2008 1767 tell_kernel(start); 2009 1768 2010 - /* Ensure that we terminate if a child dies. */ 1769 + /* Ensure that we terminate if a device-servicing child dies. */ 2011 1770 signal(SIGCHLD, kill_launcher); 2012 1771 2013 1772 /* If we exit via err(), this kills all the threads, restores tty. */

+1 -2

arch/x86/include/asm/lguest.h

··· 17 17 /* Pages for switcher itself, then two pages per cpu */ 18 18 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 19 19 20 - /* We map at -4M (-2M when PAE is activated) for ease of mapping 21 - * into the guest (one PTE page). */ 20 + /* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ 22 21 #ifdef CONFIG_X86_PAE 23 22 #define SWITCHER_ADDR 0xFFE00000 24 23 #else

+9 -9

arch/x86/include/asm/lguest_hcall.h

··· 30 30 #include <asm/hw_irq.h> 31 31 #include <asm/kvm_para.h> 32 32 33 - /*G:030 But first, how does our Guest contact the Host to ask for privileged 33 + /*G:030 34 + * But first, how does our Guest contact the Host to ask for privileged 34 35 * operations? There are two ways: the direct way is to make a "hypercall", 35 36 * to make requests of the Host Itself. 36 37 * 37 - * We use the KVM hypercall mechanism. Seventeen hypercalls are 38 - * available: the hypercall number is put in the %eax register, and the 39 - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. 40 - * If a return value makes sense, it's returned in %eax. 38 + * We use the KVM hypercall mechanism, though completely different hypercall 39 + * numbers. Seventeen hypercalls are available: the hypercall number is put in 40 + * the %eax register, and the arguments (when required) are placed in %ebx, 41 + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. 41 42 * 42 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 44 * Host, rather than returning failure. This reflects Winston Churchill's 44 - * definition of a gentleman: "someone who is only rude intentionally". */ 45 - /*:*/ 45 + * definition of a gentleman: "someone who is only rude intentionally". 46 + :*/ 46 47 47 48 /* Can't use our min() macro here: needs to be a constant */ 48 49 #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 49 50 50 51 #define LHCALL_RING_SIZE 64 51 52 struct hcall_args { 52 - /* These map directly onto eax, ebx, ecx, edx and esi 53 - * in struct lguest_regs */ 53 + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ 54 54 unsigned long arg0, arg1, arg2, arg3, arg4; 55 55 }; 56 56

+347 -162

arch/x86/lguest/boot.c

··· 22 22 * 23 23 * So how does the kernel know it's a Guest? We'll see that later, but let's 24 24 * just say that we end up here where we replace the native functions various 25 - * "paravirt" structures with our Guest versions, then boot like normal. :*/ 25 + * "paravirt" structures with our Guest versions, then boot like normal. 26 + :*/ 26 27 27 28 /* 28 29 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. ··· 75 74 * 76 75 * The Guest in our tale is a simple creature: identical to the Host but 77 76 * behaving in simplified but equivalent ways. In particular, the Guest is the 78 - * same kernel as the Host (or at least, built from the same source code). :*/ 77 + * same kernel as the Host (or at least, built from the same source code). 78 + :*/ 79 79 80 80 struct lguest_data lguest_data = { 81 81 .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, ··· 87 85 .syscall_vec = SYSCALL_VECTOR, 88 86 }; 89 87 90 - /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88 + /*G:037 89 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a 91 90 * ring buffer of stored hypercalls which the Host will run though next time we 92 91 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall 93 92 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, ··· 97 94 * If we come around to a slot which hasn't been finished, then the table is 98 95 * full and we just make the hypercall directly. This has the nice side 99 96 * effect of causing the Host to run all the stored calls in the ring buffer 100 - * which empties it for next time! */ 97 + * which empties it for next time! 
98 + */ 101 99 static void async_hcall(unsigned long call, unsigned long arg1, 102 100 unsigned long arg2, unsigned long arg3, 103 101 unsigned long arg4) ··· 107 103 static unsigned int next_call; 108 104 unsigned long flags; 109 105 110 - /* Disable interrupts if not already disabled: we don't want an 106 + /* 107 + * Disable interrupts if not already disabled: we don't want an 111 108 * interrupt handler making a hypercall while we're already doing 112 - * one! */ 109 + * one! 110 + */ 113 111 local_irq_save(flags); 114 112 if (lguest_data.hcall_status[next_call] != 0xFF) { 115 113 /* Table full, so do normal hcall which will flush table. */ ··· 131 125 local_irq_restore(flags); 132 126 } 133 127 134 - /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first 135 - * real optimization trick! 128 + /*G:035 129 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real 130 + * optimization trick! 136 131 * 137 132 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 138 133 * them as a batch when lazy_mode is eventually turned off. Because hypercalls ··· 143 136 * lguest_leave_lazy_mode(). 144 137 * 145 138 * So, when we're in lazy mode, we call async_hcall() to store the call for 146 - * future processing: */ 139 + * future processing: 140 + */ 147 141 static void lazy_hcall1(unsigned long call, 148 142 unsigned long arg1) 149 143 { ··· 154 146 async_hcall(call, arg1, 0, 0, 0); 155 147 } 156 148 149 + /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ 157 150 static void lazy_hcall2(unsigned long call, 158 151 unsigned long arg1, 159 152 unsigned long arg2) ··· 190 181 } 191 182 #endif 192 183 193 - /* When lazy mode is turned off reset the per-cpu lazy mode variable and then 194 - * issue the do-nothing hypercall to flush any stored calls. 
*/ 184 + /*G:036 185 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then 186 + * issue the do-nothing hypercall to flush any stored calls. 187 + :*/ 195 188 static void lguest_leave_lazy_mmu_mode(void) 196 189 { 197 190 kvm_hypercall0(LHCALL_FLUSH_ASYNC); ··· 219 208 * check there before it tries to deliver an interrupt. 220 209 */ 221 210 222 - /* save_flags() is expected to return the processor state (ie. "flags"). The 211 + /* 212 + * save_flags() is expected to return the processor state (ie. "flags"). The 223 213 * flags word contains all kind of stuff, but in practice Linux only cares 224 - * about the interrupt flag. Our "save_flags()" just returns that. */ 214 + * about the interrupt flag. Our "save_flags()" just returns that. 215 + */ 225 216 static unsigned long save_fl(void) 226 217 { 227 218 return lguest_data.irq_enabled; ··· 235 222 lguest_data.irq_enabled = 0; 236 223 } 237 224 238 - /* Let's pause a moment. Remember how I said these are called so often? 225 + /* 226 + * Let's pause a moment. Remember how I said these are called so often? 239 227 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to 240 228 * break some rules. In particular, these functions are assumed to save their 241 229 * own registers if they need to: normal C functions assume they can trash the 242 230 * eax register. To use normal C functions, we use 243 231 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the 244 - * C function, then restores it. */ 232 + * C function, then restores it. 233 + */ 245 234 PV_CALLEE_SAVE_REGS_THUNK(save_fl); 246 235 PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 247 236 /*:*/ ··· 252 237 extern void lg_irq_enable(void); 253 238 extern void lg_restore_fl(unsigned long flags); 254 239 255 - /*M:003 Note that we don't check for outstanding interrupts when we re-enable 256 - * them (or when we unmask an interrupt). 
This seems to work for the moment, 257 - * since interrupts are rare and we'll just get the interrupt on the next timer 258 - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way 259 - * would be to put the "irq_enabled" field in a page by itself, and have the 260 - * Host write-protect it when an interrupt comes in when irqs are disabled. 261 - * There will then be a page fault as soon as interrupts are re-enabled. 240 + /*M:003 241 + * We could be more efficient in our checking of outstanding interrupts, rather 242 + * than using a branch. One way would be to put the "irq_enabled" field in a 243 + * page by itself, and have the Host write-protect it when an interrupt comes 244 + * in when irqs are disabled. There will then be a page fault as soon as 245 + * interrupts are re-enabled. 262 246 * 263 247 * A better method is to implement soft interrupt disable generally for x86: 264 248 * instead of disabling interrupts, we set a flag. If an interrupt does come 265 249 * in, we then disable them for real. This is uncommon, so we could simply use 266 - * a hypercall for interrupt control and not worry about efficiency. :*/ 250 + * a hypercall for interrupt control and not worry about efficiency. 251 + :*/ 267 252 268 253 /*G:034 269 254 * The Interrupt Descriptor Table (IDT). ··· 276 261 static void lguest_write_idt_entry(gate_desc *dt, 277 262 int entrynum, const gate_desc *g) 278 263 { 279 - /* The gate_desc structure is 8 bytes long: we hand it to the Host in 264 + /* 265 + * The gate_desc structure is 8 bytes long: we hand it to the Host in 280 266 * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors 281 267 * around like this; typesafety wasn't a big concern in Linux's early 282 - * years. */ 268 + * years. 269 + */ 283 270 u32 *desc = (u32 *)g; 284 271 /* Keep the local copy up to date. 
*/ 285 272 native_write_idt_entry(dt, entrynum, g); ··· 289 272 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 290 273 } 291 274 292 - /* Changing to a different IDT is very rare: we keep the IDT up-to-date every 275 + /* 276 + * Changing to a different IDT is very rare: we keep the IDT up-to-date every 293 277 * time it is written, so we can simply loop through all entries and tell the 294 - * Host about them. */ 278 + * Host about them. 279 + */ 295 280 static void lguest_load_idt(const struct desc_ptr *desc) 296 281 { 297 282 unsigned int i; ··· 324 305 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 325 306 } 326 307 327 - /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, 308 + /* 309 + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, 328 310 * then tell the Host to reload the entire thing. This operation is so rare 329 - * that this naive implementation is reasonable. */ 311 + * that this naive implementation is reasonable. 312 + */ 330 313 static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, 331 314 const void *desc, int type) 332 315 { ··· 338 317 dt[entrynum].a, dt[entrynum].b); 339 318 } 340 319 341 - /* OK, I lied. There are three "thread local storage" GDT entries which change 320 + /* 321 + * OK, I lied. There are three "thread local storage" GDT entries which change 342 322 * on every context switch (these three entries are how glibc implements 343 - * __thread variables). So we have a hypercall specifically for this case. */ 323 + * __thread variables). So we have a hypercall specifically for this case. 324 + */ 344 325 static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) 345 326 { 346 - /* There's one problem which normal hardware doesn't have: the Host 327 + /* 328 + * There's one problem which normal hardware doesn't have: the Host 347 329 * can't handle us removing entries we're currently using. 
So we clear 348 - * the GS register here: if it's needed it'll be reloaded anyway. */ 330 + * the GS register here: if it's needed it'll be reloaded anyway. 331 + */ 349 332 lazy_load_gs(0); 350 333 lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); 351 334 } 352 335 353 - /*G:038 That's enough excitement for now, back to ploughing through each of 354 - * the different pv_ops structures (we're about 1/3 of the way through). 336 + /*G:038 337 + * That's enough excitement for now, back to ploughing through each of the 338 + * different pv_ops structures (we're about 1/3 of the way through). 355 339 * 356 340 * This is the Local Descriptor Table, another weird Intel thingy. Linux only 357 341 * uses this for some strange applications like Wine. We don't do anything 358 - * here, so they'll get an informative and friendly Segmentation Fault. */ 342 + * here, so they'll get an informative and friendly Segmentation Fault. 343 + */ 359 344 static void lguest_set_ldt(const void *addr, unsigned entries) 360 345 { 361 346 } 362 347 363 - /* This loads a GDT entry into the "Task Register": that entry points to a 348 + /* 349 + * This loads a GDT entry into the "Task Register": that entry points to a 364 350 * structure called the Task State Segment. Some comments scattered though the 365 351 * kernel code indicate that this used for task switching in ages past, along 366 352 * with blood sacrifice and astrology. ··· 375 347 * Now there's nothing interesting in here that we don't get told elsewhere. 376 348 * But the native version uses the "ltr" instruction, which makes the Host 377 349 * complain to the Guest about a Segmentation Fault and it'll oops. So we 378 - * override the native version with a do-nothing version. */ 350 + * override the native version with a do-nothing version. 
351 + */ 379 352 static void lguest_load_tr_desc(void) 380 353 { 381 354 } 382 355 383 - /* The "cpuid" instruction is a way of querying both the CPU identity 356 + /* 357 + * The "cpuid" instruction is a way of querying both the CPU identity 384 358 * (manufacturer, model, etc) and its features. It was introduced before the 385 359 * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. 386 360 * As you might imagine, after a decade and a half this treatment, it is now a 387 361 * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. 388 362 * 389 363 * This instruction even it has its own Wikipedia entry. The Wikipedia entry 390 - * has been translated into 4 languages. I am not making this up! 364 + * has been translated into 5 languages. I am not making this up! 391 365 * 392 366 * We could get funky here and identify ourselves as "GenuineLguest", but 393 367 * instead we just use the real "cpuid" instruction. Then I pretty much turned ··· 401 371 * Replacing the cpuid so we can turn features off is great for the kernel, but 402 372 * anyone (including userspace) can just use the raw "cpuid" instruction and 403 373 * the Host won't even notice since it isn't privileged. So we try not to get 404 - * too worked up about it. */ 374 + * too worked up about it. 375 + */ 405 376 static void lguest_cpuid(unsigned int *ax, unsigned int *bx, 406 377 unsigned int *cx, unsigned int *dx) 407 378 { ··· 410 379 411 380 native_cpuid(ax, bx, cx, dx); 412 381 switch (function) { 413 - case 0: /* ID and highest CPUID. Futureproof a little by sticking to 414 - * older ones. */ 382 + /* 383 + * CPUID 0 gives the highest legal CPUID number (and the ID string). 384 + * We futureproof our code a little by sticking to known CPUID values. 385 + */ 386 + case 0: 415 387 if (*ax > 5) 416 388 *ax = 5; 417 389 break; 418 - case 1: /* Basic feature request. 
*/ 419 - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 390 + 391 + /* 392 + * CPUID 1 is a basic feature request. 393 + * 394 + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 395 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. 396 + */ 397 + case 1: 420 398 *cx &= 0x00002201; 421 - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ 422 399 *dx &= 0x07808151; 423 - /* The Host can do a nice optimization if it knows that the 400 + /* 401 + * The Host can do a nice optimization if it knows that the 424 402 * kernel mappings (addresses above 0xC0000000 or whatever 425 403 * PAGE_OFFSET is set to) haven't changed. But Linux calls 426 404 * flush_tlb_user() for both user and kernel mappings unless 427 - * the Page Global Enable (PGE) feature bit is set. */ 405 + * the Page Global Enable (PGE) feature bit is set. 406 + */ 428 407 *dx |= 0x00002000; 429 - /* We also lie, and say we're family id 5. 6 or greater 408 + /* 409 + * We also lie, and say we're family id 5. 6 or greater 430 410 * leads to a rdmsr in early_init_intel which we can't handle. 431 - * Family ID is returned as bits 8-12 in ax. */ 411 + * Family ID is returned as bits 8-11 in ax. 412 + */ 432 413 *ax &= 0xFFFFF0FF; 433 414 *ax |= 0x00000500; 434 415 break; 416 + /* 417 + * 0x80000000 returns the highest Extended Function, so we futureproof 418 + * like we do above by limiting it to known fields. 419 + */ 435 420 case 0x80000000: 436 - /* Futureproof this a little: if they ask how much extended 437 - * processor information there is, limit it to known fields. */ 438 421 if (*ax > 0x80000008) 439 422 *ax = 0x80000008; 440 423 break; 424 + 425 + /* 426 + * PAE systems can mark pages as non-executable. Linux calls this the 427 + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced 428 + * Virus Protection). We just turn it off here, since we don't 429 + * support it. 
430 + */ 441 431 case 0x80000001: 442 - /* Here we should fix nx cap depending on host. */ 443 - /* For this version of PAE, we just clear NX bit. */ 444 432 *dx &= ~(1 << 20); 445 433 break; 446 434 } 447 435 } 448 436 449 - /* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 437 + /* 438 + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. 450 439 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother 451 440 * it. The Host needs to know when the Guest wants to change them, so we have 452 441 * a whole series of functions like read_cr0() and write_cr0(). ··· 481 430 * name like "FPUTRAP bit" be a little less cryptic? 482 431 * 483 432 * We store cr0 locally because the Host never changes it. The Guest sometimes 484 - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ 433 + * wants to read it and we'd prefer not to bother the Host unnecessarily. 434 + */ 485 435 static unsigned long current_cr0; 486 436 static void lguest_write_cr0(unsigned long val) 487 437 { ··· 495 443 return current_cr0; 496 444 } 497 445 498 - /* Intel provided a special instruction to clear the TS bit for people too cool 446 + /* 447 + * Intel provided a special instruction to clear the TS bit for people too cool 499 448 * to use write_cr0() to do it. This "clts" instruction is faster, because all 500 - * the vowels have been optimized out. */ 449 + * the vowels have been optimized out. 450 + */ 501 451 static void lguest_clts(void) 502 452 { 503 453 lazy_hcall1(LHCALL_TS, 0); 504 454 current_cr0 &= ~X86_CR0_TS; 505 455 } 506 456 507 - /* cr2 is the virtual address of the last page fault, which the Guest only ever 457 + /* 458 + * cr2 is the virtual address of the last page fault, which the Guest only ever 508 459 * reads. The Host kindly writes this into our "struct lguest_data", so we 509 - * just read it out of there. */ 460 + * just read it out of there. 
461 + */ 510 462 static unsigned long lguest_read_cr2(void) 511 463 { 512 464 return lguest_data.cr2; ··· 519 463 /* See lguest_set_pte() below. */ 520 464 static bool cr3_changed = false; 521 465 522 - /* cr3 is the current toplevel pagetable page: the principle is the same as 466 + /* 467 + * cr3 is the current toplevel pagetable page: the principle is the same as 523 468 * cr0. Keep a local copy, and tell the Host when it changes. The only 524 469 * difference is that our local copy is in lguest_data because the Host needs 525 - * to set it upon our initial hypercall. */ 470 + * to set it upon our initial hypercall. 471 + */ 526 472 static void lguest_write_cr3(unsigned long cr3) 527 473 { 528 474 lguest_data.pgdir = cr3; ··· 569 511 * cr3 ---> +---------+ 570 512 * | --------->+---------+ 571 513 * | | | PADDR1 | 572 - * Top-level | | PADDR2 | 514 + * Mid-level | | PADDR2 | 573 515 * (PMD) page | | | 574 516 * | | Lower-level | 575 517 * | | (PTE) page | ··· 589 531 * Index into top Index into second Offset within page 590 532 * page directory page pagetable page 591 533 * 592 - * The kernel spends a lot of time changing both the top-level page directory 593 - * and lower-level pagetable pages. The Guest doesn't know physical addresses, 594 - * so while it maintains these page tables exactly like normal, it also needs 595 - * to keep the Host informed whenever it makes a change: the Host will create 596 - * the real page tables based on the Guests'. 534 + * Now, unfortunately, this isn't the whole story: Intel added Physical Address 535 + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). 536 + * These are held in 64-bit page table entries, so we can now only fit 512 537 + * entries in a page, and the neat three-level tree breaks down. 
538 + * 539 + * The result is a four level page table: 540 + * 541 + * cr3 --> [ 4 Upper ] 542 + * [ Level ] 543 + * [ Entries ] 544 + * [(PUD Page)]---> +---------+ 545 + * | --------->+---------+ 546 + * | | | PADDR1 | 547 + * Mid-level | | PADDR2 | 548 + * (PMD) page | | | 549 + * | | Lower-level | 550 + * | | (PTE) page | 551 + * | | | | 552 + * .... .... 553 + * 554 + * 555 + * And the virtual address is decoded as: 556 + * 557 + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 558 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| 559 + * Index into Index into mid Index into lower Offset within page 560 + * top entries directory page pagetable page 561 + * 562 + * It's too hard to switch between these two formats at runtime, so Linux only 563 + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many 564 + * distributions turn it on, and not just for people with silly amounts of 565 + * memory: the larger PTE entries allow room for the NX bit, which lets the 566 + * kernel disable execution of pages and increase security. 567 + * 568 + * This was a problem for lguest, which couldn't run on these distributions; 569 + * then Matias Zabaljauregui figured it all out and implemented it, and only a 570 + * handful of puppies were crushed in the process! 571 + * 572 + * Back to our point: the kernel spends a lot of time changing both the 573 + * top-level page directory and lower-level pagetable pages. The Guest doesn't 574 + * know physical addresses, so while it maintains these page tables exactly 575 + * like normal, it also needs to keep the Host informed whenever it makes a 576 + * change: the Host will create the real page tables based on the Guests'. 597 577 */ 598 578 599 - /* The Guest calls this to set a second-level entry (pte), ie. to map a page 600 - * into a process' address space. We set the entry then tell the Host the 601 - * toplevel and address this corresponds to. 
The Guest uses one pagetable per 602 - * process, so we need to tell the Host which one we're changing (mm->pgd). */ 579 + /* 580 + * The Guest calls this after it has set a second-level entry (pte), ie. to map 581 + * a page into a process' address space. We tell the Host the toplevel and 582 + * address this corresponds to. The Guest uses one pagetable per process, so 583 + * we need to tell the Host which one we're changing (mm->pgd). 584 + */ 603 585 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 604 586 pte_t *ptep) 605 587 { 606 588 #ifdef CONFIG_X86_PAE 589 + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ 607 590 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, 608 591 ptep->pte_low, ptep->pte_high); 609 592 #else ··· 652 553 #endif 653 554 } 654 555 556 + /* This is the "set and update" combo-meal-deal version. */ 655 557 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 656 558 pte_t *ptep, pte_t pteval) 657 559 { ··· 660 560 lguest_pte_update(mm, addr, ptep); 661 561 } 662 562 663 - /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 563 + /* 564 + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd 664 565 * to set a middle-level entry when PAE is activated. 566 + * 665 567 * Again, we set the entry then tell the Host which page we changed, 666 - * and the index of the entry we changed. */ 568 + * and the index of the entry we changed. 569 + */ 667 570 #ifdef CONFIG_X86_PAE 668 571 static void lguest_set_pud(pud_t *pudp, pud_t pudval) 669 572 { ··· 685 582 } 686 583 #else 687 584 688 - /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not 689 - * activated. */ 585 + /* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. 
*/ 690 586 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 691 587 { 692 588 native_set_pmd(pmdp, pmdval); ··· 694 592 } 695 593 #endif 696 594 697 - /* There are a couple of legacy places where the kernel sets a PTE, but we 595 + /* 596 + * There are a couple of legacy places where the kernel sets a PTE, but we 698 597 * don't know the top level any more. This is useless for us, since we don't 699 598 * know which pagetable is changing or what address, so we just tell the Host 700 599 * to forget all of them. Fortunately, this is very rare. ··· 703 600 * ... except in early boot when the kernel sets up the initial pagetables, 704 601 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 705 602 * the Host anything changed until we've done the first page table switch, 706 - * which brings boot back to 0.25 seconds. */ 603 + * which brings boot back to 0.25 seconds. 604 + */ 707 605 static void lguest_set_pte(pte_t *ptep, pte_t pteval) 708 606 { 709 607 native_set_pte(ptep, pteval); ··· 713 609 } 714 610 715 611 #ifdef CONFIG_X86_PAE 612 + /* 613 + * With 64-bit PTE values, we need to be careful setting them: if we set 32 614 + * bits at a time, the hardware could see a weird half-set entry. These 615 + * versions ensure we update all 64 bits at once. 
616 + */ 716 617 static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) 717 618 { 718 619 native_set_pte_atomic(ptep, pte); ··· 725 616 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 726 617 } 727 618 728 - void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 619 + static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, 620 + pte_t *ptep) 729 621 { 730 622 native_pte_clear(mm, addr, ptep); 731 623 lguest_pte_update(mm, addr, ptep); 732 624 } 733 625 734 - void lguest_pmd_clear(pmd_t *pmdp) 626 + static void lguest_pmd_clear(pmd_t *pmdp) 735 627 { 736 628 lguest_set_pmd(pmdp, __pmd(0)); 737 629 } 738 630 #endif 739 631 740 - /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 632 + /* 633 + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 741 634 * native page table operations. On native hardware you can set a new page 742 635 * table entry whenever you want, but if you want to remove one you have to do 743 636 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). ··· 748 637 * called when a valid entry is written, not when it's removed (ie. marked not 749 638 * present). Instead, this is where we come when the Guest wants to remove a 750 639 * page table entry: we tell the Host to set that entry to 0 (ie. the present 751 - * bit is zero). */ 640 + * bit is zero). 641 + */ 752 642 static void lguest_flush_tlb_single(unsigned long addr) 753 643 { 754 644 /* Simply set it to zero: if it was not, it will fault back in. */ 755 645 lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); 756 646 } 757 647 758 - /* This is what happens after the Guest has removed a large number of entries. 648 + /* 649 + * This is what happens after the Guest has removed a large number of entries. 759 650 * This tells the Host that any of the page table entries for userspace might 760 - * have changed, ie. virtual addresses below PAGE_OFFSET. */ 651 + * have changed, ie. 
virtual addresses below PAGE_OFFSET. 652 + */ 761 653 static void lguest_flush_tlb_user(void) 762 654 { 763 655 lazy_hcall1(LHCALL_FLUSH_TLB, 0); 764 656 } 765 657 766 - /* This is called when the kernel page tables have changed. That's not very 658 + /* 659 + * This is called when the kernel page tables have changed. That's not very 767 660 * common (unless the Guest is using highmem, which makes the Guest extremely 768 - * slow), so it's worth separating this from the user flushing above. */ 661 + * slow), so it's worth separating this from the user flushing above. 662 + */ 769 663 static void lguest_flush_tlb_kernel(void) 770 664 { 771 665 lazy_hcall1(LHCALL_FLUSH_TLB, 1); ··· 807 691 .unmask = enable_lguest_irq, 808 692 }; 809 693 810 - /* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 694 + /* 695 + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware 811 696 * interrupt (except 128, which is used for system calls), and then tells the 812 697 * Linux infrastructure that each interrupt is controlled by our level-based 813 - * lguest interrupt controller. */ 698 + * lguest interrupt controller. 699 + */ 814 700 static void __init lguest_init_IRQ(void) 815 701 { 816 702 unsigned int i; 817 703 818 704 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 819 - /* Some systems map "vectors" to interrupts weirdly. Lguest has 820 - * a straightforward 1 to 1 mapping, so force that here. */ 705 + /* Some systems map "vectors" to interrupts weirdly. Not us! */ 821 706 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 822 707 if (i != SYSCALL_VECTOR) 823 708 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 824 709 } 825 - /* This call is required to set up for 4k stacks, where we have 826 - * separate stacks for hard and soft interrupts. */ 710 + 711 + /* 712 + * This call is required to set up for 4k stacks, where we have 713 + * separate stacks for hard and soft interrupts. 
714 + */ 827 715 irq_ctx_init(smp_processor_id()); 828 716 } 829 717 718 + /* 719 + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so 720 + * rather than set them in lguest_init_IRQ we are called here every time an 721 + * lguest device needs an interrupt. 722 + * 723 + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should 724 + * pass that up! 725 + */ 830 726 void lguest_setup_irq(unsigned int irq) 831 727 { 832 728 irq_to_desc_alloc_node(irq, 0); ··· 857 729 return lguest_data.time.tv_sec; 858 730 } 859 731 860 - /* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 732 + /* 733 + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us 861 734 * what speed it runs at, or 0 if it's unusable as a reliable clock source. 862 735 * This matches what we want here: if we return 0 from this function, the x86 863 - * TSC clock will give up and not register itself. */ 736 + * TSC clock will give up and not register itself. 737 + */ 864 738 static unsigned long lguest_tsc_khz(void) 865 739 { 866 740 return lguest_data.tsc_khz; 867 741 } 868 742 869 - /* If we can't use the TSC, the kernel falls back to our lower-priority 870 - * "lguest_clock", where we read the time value given to us by the Host. */ 743 + /* 744 + * If we can't use the TSC, the kernel falls back to our lower-priority 745 + * "lguest_clock", where we read the time value given to us by the Host. 746 + */ 871 747 static cycle_t lguest_clock_read(struct clocksource *cs) 872 748 { 873 749 unsigned long sec, nsec; 874 750 875 - /* Since the time is in two parts (seconds and nanoseconds), we risk 751 + /* 752 + * Since the time is in two parts (seconds and nanoseconds), we risk 876 753 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, 877 754 * and getting 99 and 0. 
As Linux tends to come apart under the stress 878 - * of time travel, we must be careful: */ 755 + * of time travel, we must be careful: 756 + */ 879 757 do { 880 758 /* First we read the seconds part. */ 881 759 sec = lguest_data.time.tv_sec; 882 - /* This read memory barrier tells the compiler and the CPU that 760 + /* 761 + * This read memory barrier tells the compiler and the CPU that 883 762 * this can't be reordered: we have to complete the above 884 - * before going on. */ 763 + * before going on. 764 + */ 885 765 rmb(); 886 766 /* Now we read the nanoseconds part. */ 887 767 nsec = lguest_data.time.tv_nsec; ··· 913 777 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 914 778 }; 915 779 916 - /* We also need a "struct clock_event_device": Linux asks us to set it to go 780 + /* 781 + * We also need a "struct clock_event_device": Linux asks us to set it to go 917 782 * off some time in the future. Actually, James Morris figured all this out, I 918 - * just applied the patch. */ 783 + * just applied the patch. 784 + */ 919 785 static int lguest_clockevent_set_next_event(unsigned long delta, 920 786 struct clock_event_device *evt) 921 787 { ··· 967 829 .max_delta_ns = LG_CLOCK_MAX_DELTA, 968 830 }; 969 831 970 - /* This is the Guest timer interrupt handler (hardware interrupt 0). We just 971 - * call the clockevent infrastructure and it does whatever needs doing. */ 832 + /* 833 + * This is the Guest timer interrupt handler (hardware interrupt 0). We just 834 + * call the clockevent infrastructure and it does whatever needs doing. 835 + */ 972 836 static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) 973 837 { 974 838 unsigned long flags; ··· 981 841 local_irq_restore(flags); 982 842 } 983 843 984 - /* At some point in the boot process, we get asked to set up our timing 844 + /* 845 + * At some point in the boot process, we get asked to set up our timing 985 846 * infrastructure. 
The kernel doesn't expect timer interrupts before this, but 986 847 * we cleverly initialized the "blocked_interrupts" field of "struct 987 - * lguest_data" so that timer interrupts were blocked until now. */ 848 + * lguest_data" so that timer interrupts were blocked until now. 849 + */ 988 850 static void lguest_time_init(void) 989 851 { 990 852 /* Set up the timer interrupt (0) to go to our simple timer routine */ ··· 1010 868 * to work. They're pretty simple. 1011 869 */ 1012 870 1013 - /* The Guest needs to tell the Host what stack it expects traps to use. For 871 + /* 872 + * The Guest needs to tell the Host what stack it expects traps to use. For 1014 873 * native hardware, this is part of the Task State Segment mentioned above in 1015 874 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 1016 875 * 1017 876 * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data 1018 877 * segment), the privilege level (we're privilege level 1, the Host is 0 and 1019 878 * will not tolerate us trying to use that), the stack pointer, and the number 1020 - * of pages in the stack. */ 879 + * of pages in the stack. 880 + */ 1021 881 static void lguest_load_sp0(struct tss_struct *tss, 1022 882 struct thread_struct *thread) 1023 883 { ··· 1033 889 /* FIXME: Implement */ 1034 890 } 1035 891 1036 - /* There are times when the kernel wants to make sure that no memory writes are 892 + /* 893 + * There are times when the kernel wants to make sure that no memory writes are 1037 894 * caught in the cache (that they've all reached real hardware devices). This 1038 895 * doesn't matter for the Guest which has virtual hardware. 1039 896 * ··· 1048 903 { 1049 904 } 1050 905 1051 - /* If the Guest expects to have an Advanced Programmable Interrupt Controller, 906 + /* 907 + * If the Guest expects to have an Advanced Programmable Interrupt Controller, 1052 908 * we play dumb by ignoring writes and returning 0 for reads. 
So it's no 1053 909 * longer Programmable nor Controlling anything, and I don't think 8 lines of 1054 910 * code qualifies for Advanced. It will also never interrupt anything. It 1055 - * does, however, allow us to get through the Linux boot code. */ 911 + * does, however, allow us to get through the Linux boot code. 912 + */ 1056 913 #ifdef CONFIG_X86_LOCAL_APIC 1057 914 static void lguest_apic_write(u32 reg, u32 v) 1058 915 { ··· 1103 956 kvm_hypercall0(LHCALL_HALT); 1104 957 } 1105 958 1106 - /* The SHUTDOWN hypercall takes a string to describe what's happening, and 959 + /* 960 + * The SHUTDOWN hypercall takes a string to describe what's happening, and 1107 961 * an argument which says whether this is to restart (reboot) the Guest or not. 1108 962 * 1109 963 * Note that the Host always prefers that the Guest speak in physical addresses 1110 - * rather than virtual addresses, so we use __pa() here. */ 964 + * rather than virtual addresses, so we use __pa() here. 965 + */ 1111 966 static void lguest_power_off(void) 1112 967 { 1113 968 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), ··· 1140 991 * nice to move it back to lguest_init. Patch welcome... */ 1141 992 atomic_notifier_chain_register(&panic_notifier_list, &paniced); 1142 993 1143 - /* The Linux bootloader header contains an "e820" memory map: the 1144 - * Launcher populated the first entry with our memory limit. */ 994 + /* 995 + * The Linux bootloader header contains an "e820" memory map: the 996 + * Launcher populated the first entry with our memory limit. 
997 + */ 1145 998 e820_add_region(boot_params.e820_map[0].addr, 1146 999 boot_params.e820_map[0].size, 1147 1000 boot_params.e820_map[0].type); ··· 1152 1001 return "LGUEST"; 1153 1002 } 1154 1003 1155 - /* We will eventually use the virtio console device to produce console output, 1004 + /* 1005 + * We will eventually use the virtio console device to produce console output, 1156 1006 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce 1157 - * console output. */ 1007 + * console output. 1008 + */ 1158 1009 static __init int early_put_chars(u32 vtermno, const char *buf, int count) 1159 1010 { 1160 1011 char scratch[17]; 1161 1012 unsigned int len = count; 1162 1013 1163 - /* We use a nul-terminated string, so we have to make a copy. Icky, 1164 - * huh? */ 1014 + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ 1165 1015 if (len > sizeof(scratch) - 1) 1166 1016 len = sizeof(scratch) - 1; 1167 1017 scratch[len] = '\0'; ··· 1173 1021 return len; 1174 1022 } 1175 1023 1176 - /* Rebooting also tells the Host we're finished, but the RESTART flag tells the 1177 - * Launcher to reboot us. */ 1024 + /* 1025 + * Rebooting also tells the Host we're finished, but the RESTART flag tells the 1026 + * Launcher to reboot us. 1027 + */ 1178 1028 static void lguest_restart(char *reason) 1179 1029 { 1180 1030 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); ··· 1203 1049 * fit comfortably. 1204 1050 * 1205 1051 * First we need assembly templates of each of the patchable Guest operations, 1206 - * and these are in i386_head.S. */ 1052 + * and these are in i386_head.S. 
1053 + */ 1207 1054 1208 1055 /*G:060 We construct a table from the assembler templates: */ 1209 1056 static const struct lguest_insns ··· 1215 1060 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1216 1061 }; 1217 1062 1218 - /* Now our patch routine is fairly simple (based on the native one in 1063 + /* 1064 + * Now our patch routine is fairly simple (based on the native one in 1219 1065 * paravirt.c). If we have a replacement, we copy it in and return how much of 1220 - * the available space we used. */ 1066 + * the available space we used. 1067 + */ 1221 1068 static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, 1222 1069 unsigned long addr, unsigned len) 1223 1070 { ··· 1231 1074 1232 1075 insn_len = lguest_insns[type].end - lguest_insns[type].start; 1233 1076 1234 - /* Similarly if we can't fit replacement (shouldn't happen, but let's 1235 - * be thorough). */ 1077 + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ 1236 1078 if (len < insn_len) 1237 1079 return paravirt_patch_default(type, clobber, ibuf, addr, len); 1238 1080 ··· 1240 1084 return insn_len; 1241 1085 } 1242 1086 1243 - /*G:029 Once we get to lguest_init(), we know we're a Guest. The various 1087 + /*G:029 1088 + * Once we get to lguest_init(), we know we're a Guest. The various 1244 1089 * pv_ops structures in the kernel provide points for (almost) every routine we 1245 - * have to override to avoid privileged instructions. */ 1090 + * have to override to avoid privileged instructions. 1091 + */ 1246 1092 __init void lguest_init(void) 1247 1093 { 1248 - /* We're under lguest, paravirt is enabled, and we're running at 1249 - * privilege level 1, not 0 as normal. */ 1094 + /* We're under lguest. */ 1250 1095 pv_info.name = "lguest"; 1096 + /* Paravirt is enabled. */ 1251 1097 pv_info.paravirt_enabled = 1; 1098 + /* We're running at privilege level 1, not 0 as normal. 
*/ 1252 1099 pv_info.kernel_rpl = 1; 1100 + /* Everyone except Xen runs with this set. */ 1253 1101 pv_info.shared_kernel_pmd = 1; 1254 1102 1255 - /* We set up all the lguest overrides for sensitive operations. These 1256 - * are detailed with the operations themselves. */ 1103 + /* 1104 + * We set up all the lguest overrides for sensitive operations. These 1105 + * are detailed with the operations themselves. 1106 + */ 1257 1107 1258 - /* interrupt-related operations */ 1108 + /* Interrupt-related operations */ 1259 1109 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1260 1110 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1261 1111 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); ··· 1269 1107 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); 1270 1108 pv_irq_ops.safe_halt = lguest_safe_halt; 1271 1109 1272 - /* init-time operations */ 1110 + /* Setup operations */ 1273 1111 pv_init_ops.memory_setup = lguest_memory_setup; 1274 1112 pv_init_ops.patch = lguest_patch; 1275 1113 1276 - /* Intercepts of various cpu instructions */ 1114 + /* Intercepts of various CPU instructions */ 1277 1115 pv_cpu_ops.load_gdt = lguest_load_gdt; 1278 1116 pv_cpu_ops.cpuid = lguest_cpuid; 1279 1117 pv_cpu_ops.load_idt = lguest_load_idt; ··· 1294 1132 pv_cpu_ops.start_context_switch = paravirt_start_context_switch; 1295 1133 pv_cpu_ops.end_context_switch = lguest_end_context_switch; 1296 1134 1297 - /* pagetable management */ 1135 + /* Pagetable management */ 1298 1136 pv_mmu_ops.write_cr3 = lguest_write_cr3; 1299 1137 pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; 1300 1138 pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; ··· 1316 1154 pv_mmu_ops.pte_update_defer = lguest_pte_update; 1317 1155 1318 1156 #ifdef CONFIG_X86_LOCAL_APIC 1319 - /* apic read/write intercepts */ 1157 + /* APIC read/write intercepts */ 1320 1158 set_lguest_basic_apic_ops(); 1321 1159 #endif 1322 1160 1323 - /* time operations */ 1161 + /* Time operations */ 1324 1162 
pv_time_ops.get_wallclock = lguest_get_wallclock; 1325 1163 pv_time_ops.time_init = lguest_time_init; 1326 1164 pv_time_ops.get_tsc_khz = lguest_tsc_khz; 1327 1165 1328 - /* Now is a good time to look at the implementations of these functions 1329 - * before returning to the rest of lguest_init(). */ 1166 + /* 1167 + * Now is a good time to look at the implementations of these functions 1168 + * before returning to the rest of lguest_init(). 1169 + */ 1330 1170 1331 - /*G:070 Now we've seen all the paravirt_ops, we return to 1171 + /*G:070 1172 + * Now we've seen all the paravirt_ops, we return to 1332 1173 * lguest_init() where the rest of the fairly chaotic boot setup 1333 - * occurs. */ 1174 + * occurs. 1175 + */ 1334 1176 1335 - /* The stack protector is a weird thing where gcc places a canary 1177 + /* 1178 + * The stack protector is a weird thing where gcc places a canary 1336 1179 * value on the stack and then checks it on return. This file is 1337 1180 * compiled with -fno-stack-protector it, so we got this far without 1338 1181 * problems. The value of the canary is kept at offset 20 from the 1339 1182 * %gs register, so we need to set that up before calling C functions 1340 - * in other files. */ 1183 + * in other files. 1184 + */ 1341 1185 setup_stack_canary_segment(0); 1342 - /* We could just call load_stack_canary_segment(), but we might as 1343 - * call switch_to_new_gdt() which loads the whole table and sets up 1344 - * the per-cpu segment descriptor register %fs as well. */ 1186 + 1187 + /* 1188 + * We could just call load_stack_canary_segment(), but we might as well 1189 + * call switch_to_new_gdt() which loads the whole table and sets up the 1190 + * per-cpu segment descriptor register %fs as well. 1191 + */ 1345 1192 switch_to_new_gdt(0); 1346 1193 1347 - /* As described in head_32.S, we map the first 128M of memory. */ 1194 + /* We actually boot with all memory mapped, but let's say 128MB. 
*/ 1348 1195 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1349 1196 1350 - /* The Host<->Guest Switcher lives at the top of our address space, and 1197 + /* 1198 + * The Host<->Guest Switcher lives at the top of our address space, and 1351 1199 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1352 - * it put the answer in lguest_data.reserve_mem */ 1200 + * it put the answer in lguest_data.reserve_mem 1201 + */ 1353 1202 reserve_top_address(lguest_data.reserve_mem); 1354 1203 1355 - /* If we don't initialize the lock dependency checker now, it crashes 1356 - * paravirt_disable_iospace. */ 1204 + /* 1205 + * If we don't initialize the lock dependency checker now, it crashes 1206 + * paravirt_disable_iospace. 1207 + */ 1357 1208 lockdep_init(); 1358 1209 1359 - /* The IDE code spends about 3 seconds probing for disks: if we reserve 1210 + /* 1211 + * The IDE code spends about 3 seconds probing for disks: if we reserve 1360 1212 * all the I/O ports up front it can't get them and so doesn't probe. 1361 1213 * Other device drivers are similar (but less severe). This cuts the 1362 - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ 1214 + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. 1215 + */ 1363 1216 paravirt_disable_iospace(); 1364 1217 1365 - /* This is messy CPU setup stuff which the native boot code does before 1366 - * start_kernel, so we have to do, too: */ 1218 + /* 1219 + * This is messy CPU setup stuff which the native boot code does before 1220 + * start_kernel, so we have to do, too: 1221 + */ 1367 1222 cpu_detect(&new_cpu_data); 1368 1223 /* head.S usually sets up the first capability word, so do it here. */ 1369 1224 new_cpu_data.x86_capability[0] = cpuid_edx(1); ··· 1397 1218 acpi_ht = 0; 1398 1219 #endif 1399 1220 1400 - /* We set the preferred console to "hvc". This is the "hypervisor 1221 + /* 1222 + * We set the preferred console to "hvc". 
This is the "hypervisor 1401 1223 * virtual console" driver written by the PowerPC people, which we also 1402 - * adapted for lguest's use. */ 1224 + * adapted for lguest's use. 1225 + */ 1403 1226 add_preferred_console("hvc", 0, NULL); 1404 1227 1405 1228 /* Register our very early console. */ 1406 1229 virtio_cons_early_init(early_put_chars); 1407 1230 1408 - /* Last of all, we set the power management poweroff hook to point to 1231 + /* 1232 + * Last of all, we set the power management poweroff hook to point to 1409 1233 * the Guest routine to power off, and the reboot hook to our restart 1410 - * routine. */ 1234 + * routine. 1235 + */ 1411 1236 pm_power_off = lguest_power_off; 1412 1237 machine_ops.restart = lguest_restart; 1413 1238 1414 - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed 1415 - * to boot as normal. It never returns. */ 1239 + /* 1240 + * Now we're set up, call i386_start_kernel() in head32.c and we proceed 1241 + * to boot as normal. It never returns. 1242 + */ 1416 1243 i386_start_kernel(); 1417 1244 } 1418 1245 /*

+70 -42

arch/x86/lguest/i386_head.S

··· 5 5 #include <asm/thread_info.h> 6 6 #include <asm/processor-flags.h> 7 7 8 - /*G:020 Our story starts with the kernel booting into startup_32 in 8 + /*G:020 9 + * Our story starts with the kernel booting into startup_32 in 9 10 * arch/x86/kernel/head_32.S. It expects a boot header, which is created by 10 11 * the bootloader (the Launcher in our case). 11 12 * ··· 22 21 * data without remembering to subtract __PAGE_OFFSET! 23 22 * 24 23 * The .section line puts this code in .init.text so it will be discarded after 25 - * boot. */ 24 + * boot. 25 + */ 26 26 .section .init.text, "ax", @progbits 27 27 ENTRY(lguest_entry) 28 - /* We make the "initialization" hypercall now to tell the Host about 29 - * us, and also find out where it put our page tables. */ 28 + /* 29 + * We make the "initialization" hypercall now to tell the Host about 30 + * us, and also find out where it put our page tables. 31 + */ 30 32 movl $LHCALL_LGUEST_INIT, %eax 31 33 movl $lguest_data - __PAGE_OFFSET, %ebx 32 34 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ ··· 37 33 /* Set up the initial stack so we can run C code. */ 38 34 movl $(init_thread_union+THREAD_SIZE),%esp 39 35 40 - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the 41 - * moment. */ 36 + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ 42 37 jmp lguest_init+__PAGE_OFFSET 43 38 44 - /*G:055 We create a macro which puts the assembler code between lgstart_ and 45 - * lgend_ markers. These templates are put in the .text section: they can't be 46 - * discarded after boot as we may need to patch modules, too. */ 39 + /*G:055 40 + * We create a macro which puts the assembler code between lgstart_ and lgend_ 41 + * markers. These templates are put in the .text section: they can't be 42 + * discarded after boot as we may need to patch modules, too. 43 + */ 47 44 .text 48 45 #define LGUEST_PATCH(name, insns...) 
\ 49 46 lgstart_##name: insns; lgend_##name:; \ ··· 53 48 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 54 49 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 55 50 56 - /*G:033 But using those wrappers is inefficient (we'll see why that doesn't 57 - * matter for save_fl and irq_disable later). If we write our routines 58 - * carefully in assembler, we can avoid clobbering any registers and avoid 59 - * jumping through the wrapper functions. 51 + /*G:033 52 + * But using those wrappers is inefficient (we'll see why that doesn't matter 53 + * for save_fl and irq_disable later). If we write our routines carefully in 54 + * assembler, we can avoid clobbering any registers and avoid jumping through 55 + * the wrapper functions. 60 56 * 61 57 * I skipped over our first piece of assembler, but this one is worth studying 62 - * in a bit more detail so I'll describe in easy stages. First, the routine 63 - * to enable interrupts: */ 58 + * in a bit more detail so I'll describe in easy stages. First, the routine to 59 + * enable interrupts: 60 + */ 64 61 ENTRY(lg_irq_enable) 65 - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to 66 - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ 62 + /* 63 + * The reverse of irq_disable, this sets lguest_data.irq_enabled to 64 + * X86_EFLAGS_IF (ie. "Interrupts enabled"). 65 + */ 67 66 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled 68 - /* But now we need to check if the Host wants to know: there might have 67 + /* 68 + * But now we need to check if the Host wants to know: there might have 69 69 * been interrupts waiting to be delivered, in which case it will have 70 70 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we 71 - * jump to send_interrupts, otherwise we're done. */ 71 + * jump to send_interrupts, otherwise we're done. 
72 + */ 72 73 testl $0, lguest_data+LGUEST_DATA_irq_pending 73 74 jnz send_interrupts 74 - /* One cool thing about x86 is that you can do many things without using 75 + /* 76 + * One cool thing about x86 is that you can do many things without using 75 77 * a register. In this case, the normal path hasn't needed to save or 76 - * restore any registers at all! */ 78 + * restore any registers at all! 79 + */ 77 80 ret 78 81 send_interrupts: 79 - /* OK, now we need a register: eax is used for the hypercall number, 82 + /* 83 + * OK, now we need a register: eax is used for the hypercall number, 80 84 * which is LHCALL_SEND_INTERRUPTS. 81 85 * 82 86 * We used not to bother with this pending detection at all, which was 83 87 * much simpler. Sooner or later the Host would realize it had to 84 88 * send us an interrupt. But that turns out to make performance 7 85 89 * times worse on a simple tcp benchmark. So now we do this the hard 86 - * way. */ 90 + * way. 91 + */ 87 92 pushl %eax 88 93 movl $LHCALL_SEND_INTERRUPTS, %eax 89 - /* This is a vmcall instruction (same thing that KVM uses). Older 94 + /* 95 + * This is a vmcall instruction (same thing that KVM uses). Older 90 96 * assembler versions might not know the "vmcall" instruction, so we 91 - * create one manually here. */ 97 + * create one manually here. 98 + */ 92 99 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 100 + /* Put eax back the way we found it. */ 93 101 popl %eax 94 102 ret 95 103 96 - /* Finally, the "popf" or "restore flags" routine. The %eax register holds the 104 + /* 105 + * Finally, the "popf" or "restore flags" routine. The %eax register holds the 97 106 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're 98 - * enabling interrupts again, if it's 0 we're leaving them off. */ 107 + * enabling interrupts again, if it's 0 we're leaving them off. 
108 + */ 99 109 ENTRY(lg_restore_fl) 100 110 /* This is just "lguest_data.irq_enabled = flags;" */ 101 111 movl %eax, lguest_data+LGUEST_DATA_irq_enabled 102 - /* Now, if the %eax value has enabled interrupts and 112 + /* 113 + * Now, if the %eax value has enabled interrupts and 103 114 * lguest_data.irq_pending is set, we want to tell the Host so it can 104 115 * deliver any outstanding interrupts. Fortunately, both values will 105 116 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" 106 117 * instruction will AND them together for us. If both are set, we 107 - * jump to send_interrupts. */ 118 + * jump to send_interrupts. 119 + */ 108 120 testl lguest_data+LGUEST_DATA_irq_pending, %eax 109 121 jnz send_interrupts 110 122 /* Again, the normal path has used no extra registers. Clever, huh? */ 111 123 ret 124 + /*:*/ 112 125 113 126 /* These demark the EIP range where host should never deliver interrupts. */ 114 127 .global lguest_noirq_start 115 128 .global lguest_noirq_end 116 129 117 - /*M:004 When the Host reflects a trap or injects an interrupt into the Guest, 118 - * it sets the eflags interrupt bit on the stack based on 119 - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when 120 - * restoring it. However, when the Host sets the Guest up for direct traps, 121 - * such as system calls, the processor is the one to push eflags onto the 122 - * stack, and the interrupt bit will be 1 (in reality, interrupts are always 123 - * enabled in the Guest). 130 + /*M:004 131 + * When the Host reflects a trap or injects an interrupt into the Guest, it 132 + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, 133 + * so the Guest iret logic does the right thing when restoring it. 
However, 134 + * when the Host sets the Guest up for direct traps, such as system calls, the 135 + * processor is the one to push eflags onto the stack, and the interrupt bit 136 + * will be 1 (in reality, interrupts are always enabled in the Guest). 124 137 * 125 138 * This turns out to be harmless: the only trap which should happen under Linux 126 139 * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc 127 140 * regions), which has to be reflected through the Host anyway. If another 128 141 * trap *does* go off when interrupts are disabled, the Guest will panic, and 129 - * we'll never get to this iret! :*/ 142 + * we'll never get to this iret! 143 + :*/ 130 144 131 - /*G:045 There is one final paravirt_op that the Guest implements, and glancing 132 - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! 145 + /*G:045 146 + * There is one final paravirt_op that the Guest implements, and glancing at it 147 + * you can see why I left it to last. It's *cool*! It's in *assembler*! 133 148 * 134 149 * The "iret" instruction is used to return from an interrupt or trap. The 135 150 * stack looks like this: ··· 173 148 * return to userspace or wherever. Our solution to this is to surround the 174 149 * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the 175 150 * Host that it is *never* to interrupt us there, even if interrupts seem to be 176 - * enabled. */ 151 + * enabled. 152 + */ 177 153 ENTRY(lguest_iret) 178 154 pushl %eax 179 155 movl 12(%esp), %eax 180 156 lguest_noirq_start: 181 - /* Note the %ss: segment prefix here. Normal data accesses use the 157 + /* 158 + * Note the %ss: segment prefix here. Normal data accesses use the 182 159 * "ds" segment, but that will have already been restored for whatever 183 160 * we're returning to (such as userspace): we can't trust it. The %ss: 184 - * prefix makes sure we use the stack segment, which is still valid. 
*/ 161 + * prefix makes sure we use the stack segment, which is still valid. 162 + */ 185 163 movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled 186 164 popl %eax 187 165 iret

+80 -39

drivers/lguest/core.c

··· 1 - /*P:400 This contains run_guest() which actually calls into the Host<->Guest 1 + /*P:400 2 + * This contains run_guest() which actually calls into the Host<->Guest 2 3 * Switcher and analyzes the return, such as determining if the Guest wants the 3 - * Host to do something. This file also contains useful helper routines. :*/ 4 + * Host to do something. This file also contains useful helper routines. 5 + :*/ 4 6 #include <linux/module.h> 5 7 #include <linux/stringify.h> 6 8 #include <linux/stddef.h> ··· 26 24 /* This One Big lock protects all inter-guest data structures. */ 27 25 DEFINE_MUTEX(lguest_lock); 28 26 29 - /*H:010 We need to set up the Switcher at a high virtual address. Remember the 27 + /*H:010 28 + * We need to set up the Switcher at a high virtual address. Remember the 30 29 * Switcher is a few hundred bytes of assembler code which actually changes the 31 30 * CPU to run the Guest, and then changes back to the Host when a trap or 32 31 * interrupt happens. ··· 36 33 * Host since it will be running as the switchover occurs. 37 34 * 38 35 * Trying to map memory at a particular address is an unusual thing to do, so 39 - * it's not a simple one-liner. */ 36 + * it's not a simple one-liner. 37 + */ 40 38 static __init int map_switcher(void) 41 39 { 42 40 int i, err; ··· 51 47 * easy. 52 48 */ 53 49 54 - /* We allocate an array of struct page pointers. map_vm_area() wants 55 - * this, rather than just an array of pages. */ 50 + /* 51 + * We allocate an array of struct page pointers. map_vm_area() wants 52 + * this, rather than just an array of pages. 53 + */ 56 54 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 57 55 GFP_KERNEL); 58 56 if (!switcher_page) { ··· 62 56 goto out; 63 57 } 64 58 65 - /* Now we actually allocate the pages. The Guest will see these pages, 66 - * so we make sure they're zeroed. */ 59 + /* 60 + * Now we actually allocate the pages. The Guest will see these pages, 61 + * so we make sure they're zeroed. 
62 + */ 67 63 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 68 64 unsigned long addr = get_zeroed_page(GFP_KERNEL); 69 65 if (!addr) { ··· 75 67 switcher_page[i] = virt_to_page(addr); 76 68 } 77 69 78 - /* First we check that the Switcher won't overlap the fixmap area at 70 + /* 71 + * First we check that the Switcher won't overlap the fixmap area at 79 72 * the top of memory. It's currently nowhere near, but it could have 80 - * very strange effects if it ever happened. */ 73 + * very strange effects if it ever happened. 74 + */ 81 75 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ 82 76 err = -ENOMEM; 83 77 printk("lguest: mapping switcher would thwack fixmap\n"); 84 78 goto free_pages; 85 79 } 86 80 87 - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 81 + /* 82 + * Now we reserve the "virtual memory area" we want: 0xFFC00000 88 83 * (SWITCHER_ADDR). We might not get it in theory, but in practice 89 84 * it's worked so far. The end address needs +1 because __get_vm_area 90 - * allocates an extra guard page, so we need space for that. */ 85 + * allocates an extra guard page, so we need space for that. 86 + */ 91 87 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, 92 88 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR 93 89 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); ··· 101 89 goto free_pages; 102 90 } 103 91 104 - /* This code actually sets up the pages we've allocated to appear at 92 + /* 93 + * This code actually sets up the pages we've allocated to appear at 105 94 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the 106 95 * kind of pages we're mapping (kernel pages), and a pointer to our 107 96 * array of struct pages. It increments that pointer, but we don't 108 - * care. */ 97 + * care. 
98 + */ 109 99 pagep = switcher_page; 110 100 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); 111 101 if (err) { ··· 115 101 goto free_vma; 116 102 } 117 103 118 - /* Now the Switcher is mapped at the right address, we can't fail! 119 - * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ 104 + /* 105 + * Now the Switcher is mapped at the right address, we can't fail! 106 + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). 107 + */ 120 108 memcpy(switcher_vma->addr, start_switcher_text, 121 109 end_switcher_text - start_switcher_text); 122 110 ··· 140 124 } 141 125 /*:*/ 142 126 143 - /* Cleaning up the mapping when the module is unloaded is almost... 144 - * too easy. */ 127 + /* Cleaning up the mapping when the module is unloaded is almost... too easy. */ 145 128 static void unmap_switcher(void) 146 129 { 147 130 unsigned int i; ··· 166 151 * But we can't trust the Guest: it might be trying to access the Launcher 167 152 * code. We have to check that the range is below the pfn_limit the Launcher 168 153 * gave us. We have to make sure that addr + len doesn't give us a false 169 - * positive by overflowing, too. */ 154 + * positive by overflowing, too. 155 + */ 170 156 bool lguest_address_ok(const struct lguest *lg, 171 157 unsigned long addr, unsigned long len) 172 158 { 173 159 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 174 160 } 175 161 176 - /* This routine copies memory from the Guest. Here we can see how useful the 162 + /* 163 + * This routine copies memory from the Guest. Here we can see how useful the 177 164 * kill_lguest() routine we met in the Launcher can be: we return a random 178 - * value (all zeroes) instead of needing to return an error. */ 165 + * value (all zeroes) instead of needing to return an error. 
166 + */ 179 167 void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) 180 168 { 181 169 if (!lguest_address_ok(cpu->lg, addr, bytes) ··· 199 181 } 200 182 /*:*/ 201 183 202 - /*H:030 Let's jump straight to the the main loop which runs the Guest. 184 + /*H:030 185 + * Let's jump straight to the the main loop which runs the Guest. 203 186 * Remember, this is called by the Launcher reading /dev/lguest, and we keep 204 - * going around and around until something interesting happens. */ 187 + * going around and around until something interesting happens. 188 + */ 205 189 int run_guest(struct lg_cpu *cpu, unsigned long __user *user) 206 190 { 207 191 /* We stop running once the Guest is dead. */ ··· 215 195 if (cpu->hcall) 216 196 do_hypercalls(cpu); 217 197 218 - /* It's possible the Guest did a NOTIFY hypercall to the 219 - * Launcher, in which case we return from the read() now. */ 198 + /* 199 + * It's possible the Guest did a NOTIFY hypercall to the 200 + * Launcher. 201 + */ 220 202 if (cpu->pending_notify) { 203 + /* 204 + * Does it just needs to write to a registered 205 + * eventfd (ie. the appropriate virtqueue thread)? 206 + */ 221 207 if (!send_notify_to_eventfd(cpu)) { 208 + /* OK, we tell the main Laucher. */ 222 209 if (put_user(cpu->pending_notify, user)) 223 210 return -EFAULT; 224 211 return sizeof(cpu->pending_notify); ··· 236 209 if (signal_pending(current)) 237 210 return -ERESTARTSYS; 238 211 239 - /* Check if there are any interrupts which can be delivered now: 212 + /* 213 + * Check if there are any interrupts which can be delivered now: 240 214 * if so, this sets up the hander to be executed when we next 241 - * run the Guest. */ 215 + * run the Guest. 
216 + */ 242 217 irq = interrupt_pending(cpu, &more); 243 218 if (irq < LGUEST_IRQS) 244 219 try_deliver_interrupt(cpu, irq, more); 245 220 246 - /* All long-lived kernel loops need to check with this horrible 221 + /* 222 + * All long-lived kernel loops need to check with this horrible 247 223 * thing called the freezer. If the Host is trying to suspend, 248 - * it stops us. */ 224 + * it stops us. 225 + */ 249 226 try_to_freeze(); 250 227 251 - /* Just make absolutely sure the Guest is still alive. One of 252 - * those hypercalls could have been fatal, for example. */ 228 + /* 229 + * Just make absolutely sure the Guest is still alive. One of 230 + * those hypercalls could have been fatal, for example. 231 + */ 253 232 if (cpu->lg->dead) 254 233 break; 255 234 256 - /* If the Guest asked to be stopped, we sleep. The Guest's 257 - * clock timer will wake us. */ 235 + /* 236 + * If the Guest asked to be stopped, we sleep. The Guest's 237 + * clock timer will wake us. 238 + */ 258 239 if (cpu->halted) { 259 240 set_current_state(TASK_INTERRUPTIBLE); 260 - /* Just before we sleep, make sure no interrupt snuck in 261 - * which we should be doing. */ 241 + /* 242 + * Just before we sleep, make sure no interrupt snuck in 243 + * which we should be doing. 244 + */ 262 245 if (interrupt_pending(cpu, &more) < LGUEST_IRQS) 263 246 set_current_state(TASK_RUNNING); 264 247 else ··· 276 239 continue; 277 240 } 278 241 279 - /* OK, now we're ready to jump into the Guest. First we put up 280 - * the "Do Not Disturb" sign: */ 242 + /* 243 + * OK, now we're ready to jump into the Guest. First we put up 244 + * the "Do Not Disturb" sign: 245 + */ 281 246 local_irq_disable(); 282 247 283 248 /* Actually run the Guest until something happens. */ ··· 366 327 } 367 328 /*:*/ 368 329 369 - /* The Host side of lguest can be a module. This is a nice way for people to 370 - * play with it. */ 330 + /* 331 + * The Host side of lguest can be a module. 
This is a nice way for people to 332 + * play with it. 333 + */ 371 334 module_init(init); 372 335 module_exit(fini); 373 336 MODULE_LICENSE("GPL");

+96 -49

drivers/lguest/hypercalls.c

··· 1 - /*P:500 Just as userspace programs request kernel operations through a system 1 + /*P:500 2 + * Just as userspace programs request kernel operations through a system 2 3 * call, the Guest requests Host operations through a "hypercall". You might 3 4 * notice this nomenclature doesn't really follow any logic, but the name has 4 5 * been around for long enough that we're stuck with it. As you'd expect, this 5 - * code is basically a one big switch statement. :*/ 6 + * code is basically a one big switch statement. 7 + :*/ 6 8 7 9 /* Copyright (C) 2006 Rusty Russell IBM Corporation 8 10 ··· 30 28 #include <asm/pgtable.h> 31 29 #include "lg.h" 32 30 33 - /*H:120 This is the core hypercall routine: where the Guest gets what it wants. 34 - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ 31 + /*H:120 32 + * This is the core hypercall routine: where the Guest gets what it wants. 33 + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. 34 + */ 35 35 static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 36 36 { 37 37 switch (args->arg0) { 38 38 case LHCALL_FLUSH_ASYNC: 39 - /* This call does nothing, except by breaking out of the Guest 40 - * it makes us process all the asynchronous hypercalls. */ 39 + /* 40 + * This call does nothing, except by breaking out of the Guest 41 + * it makes us process all the asynchronous hypercalls. 42 + */ 41 43 break; 42 44 case LHCALL_SEND_INTERRUPTS: 43 - /* This call does nothing too, but by breaking out of the Guest 44 - * it makes us process any pending interrupts. */ 45 + /* 46 + * This call does nothing too, but by breaking out of the Guest 47 + * it makes us process any pending interrupts. 48 + */ 45 49 break; 46 50 case LHCALL_LGUEST_INIT: 47 - /* You can't get here unless you're already initialized. Don't 48 - * do that. */ 51 + /* 52 + * You can't get here unless you're already initialized. Don't 53 + * do that. 
54 + */ 49 55 kill_guest(cpu, "already have lguest_data"); 50 56 break; 51 57 case LHCALL_SHUTDOWN: { 52 - /* Shutdown is such a trivial hypercall that we do it in four 53 - * lines right here. */ 54 58 char msg[128]; 55 - /* If the lgread fails, it will call kill_guest() itself; the 56 - * kill_guest() with the message will be ignored. */ 59 + /* 60 + * Shutdown is such a trivial hypercall that we do it in five 61 + * lines right here. 62 + * 63 + * If the lgread fails, it will call kill_guest() itself; the 64 + * kill_guest() with the message will be ignored. 65 + */ 57 66 __lgread(cpu, msg, args->arg1, sizeof(msg)); 58 67 msg[sizeof(msg)-1] = '\0'; 59 68 kill_guest(cpu, "CRASH: %s", msg); ··· 73 60 break; 74 61 } 75 62 case LHCALL_FLUSH_TLB: 76 - /* FLUSH_TLB comes in two flavors, depending on the 77 - * argument: */ 63 + /* FLUSH_TLB comes in two flavors, depending on the argument: */ 78 64 if (args->arg1) 79 65 guest_pagetable_clear_all(cpu); 80 66 else 81 67 guest_pagetable_flush_user(cpu); 82 68 break; 83 69 84 - /* All these calls simply pass the arguments through to the right 85 - * routines. */ 70 + /* 71 + * All these calls simply pass the arguments through to the right 72 + * routines. 73 + */ 86 74 case LHCALL_NEW_PGTABLE: 87 75 guest_new_pagetable(cpu, args->arg1); 88 76 break; ··· 126 112 kill_guest(cpu, "Bad hypercall %li\n", args->arg0); 127 113 } 128 114 } 129 - /*:*/ 130 115 131 - /*H:124 Asynchronous hypercalls are easy: we just look in the array in the 116 + /*H:124 117 + * Asynchronous hypercalls are easy: we just look in the array in the 132 118 * Guest's "struct lguest_data" to see if any new ones are marked "ready". 133 119 * 134 120 * We are careful to do these in order: obviously we respect the order the 135 121 * Guest put them in the ring, but we also promise the Guest that they will 136 122 * happen before any normal hypercall (which is why we check this before 137 - * checking for a normal hcall). 
*/ 123 + * checking for a normal hcall). 124 + */ 138 125 static void do_async_hcalls(struct lg_cpu *cpu) 139 126 { 140 127 unsigned int i; ··· 148 133 /* We process "struct lguest_data"s hcalls[] ring once. */ 149 134 for (i = 0; i < ARRAY_SIZE(st); i++) { 150 135 struct hcall_args args; 151 - /* We remember where we were up to from last time. This makes 136 + /* 137 + * We remember where we were up to from last time. This makes 152 138 * sure that the hypercalls are done in the order the Guest 153 - * places them in the ring. */ 139 + * places them in the ring. 140 + */ 154 141 unsigned int n = cpu->next_hcall; 155 142 156 143 /* 0xFF means there's no call here (yet). */ 157 144 if (st[n] == 0xFF) 158 145 break; 159 146 160 - /* OK, we have hypercall. Increment the "next_hcall" cursor, 161 - * and wrap back to 0 if we reach the end. */ 147 + /* 148 + * OK, we have hypercall. Increment the "next_hcall" cursor, 149 + * and wrap back to 0 if we reach the end. 150 + */ 162 151 if (++cpu->next_hcall == LHCALL_RING_SIZE) 163 152 cpu->next_hcall = 0; 164 153 165 - /* Copy the hypercall arguments into a local copy of 166 - * the hcall_args struct. */ 154 + /* 155 + * Copy the hypercall arguments into a local copy of the 156 + * hcall_args struct. 157 + */ 167 158 if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], 168 159 sizeof(struct hcall_args))) { 169 160 kill_guest(cpu, "Fetching async hypercalls"); ··· 185 164 break; 186 165 } 187 166 188 - /* Stop doing hypercalls if they want to notify the Launcher: 189 - * it needs to service this first. */ 167 + /* 168 + * Stop doing hypercalls if they want to notify the Launcher: 169 + * it needs to service this first. 170 + */ 190 171 if (cpu->pending_notify) 191 172 break; 192 173 } 193 174 } 194 175 195 - /* Last of all, we look at what happens first of all. The very first time the 196 - * Guest makes a hypercall, we end up here to set things up: */ 176 + /* 177 + * Last of all, we look at what happens first of all. 
The very first time the 178 + * Guest makes a hypercall, we end up here to set things up: 179 + */ 197 180 static void initialize(struct lg_cpu *cpu) 198 181 { 199 - /* You can't do anything until you're initialized. The Guest knows the 200 - * rules, so we're unforgiving here. */ 182 + /* 183 + * You can't do anything until you're initialized. The Guest knows the 184 + * rules, so we're unforgiving here. 185 + */ 201 186 if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { 202 187 kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); 203 188 return; ··· 212 185 if (lguest_arch_init_hypercalls(cpu)) 213 186 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 214 187 215 - /* The Guest tells us where we're not to deliver interrupts by putting 216 - * the range of addresses into "struct lguest_data". */ 188 + /* 189 + * The Guest tells us where we're not to deliver interrupts by putting 190 + * the range of addresses into "struct lguest_data". 191 + */ 217 192 if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) 218 193 || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) 219 194 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 220 195 221 - /* We write the current time into the Guest's data page once so it can 222 - * set its clock. */ 196 + /* 197 + * We write the current time into the Guest's data page once so it can 198 + * set its clock. 199 + */ 223 200 write_timestamp(cpu); 224 201 225 202 /* page_tables.c will also do some setup. */ 226 203 page_table_guest_data_init(cpu); 227 204 228 - /* This is the one case where the above accesses might have been the 205 + /* 206 + * This is the one case where the above accesses might have been the 229 207 * first write to a Guest page. This may have caused a copy-on-write 230 208 * fault, but the old page might be (read-only) in the Guest 231 - * pagetable. */ 209 + * pagetable. 
210 + */ 232 211 guest_pagetable_clear_all(cpu); 233 212 } 234 213 /*:*/ 235 214 236 - /*M:013 If a Guest reads from a page (so creates a mapping) that it has never 215 + /*M:013 216 + * If a Guest reads from a page (so creates a mapping) that it has never 237 217 * written to, and then the Launcher writes to it (ie. the output of a virtual 238 218 * device), the Guest will still see the old page. In practice, this never 239 219 * happens: why would the Guest read a page which it has never written to? But 240 - * a similar scenario might one day bite us, so it's worth mentioning. :*/ 220 + * a similar scenario might one day bite us, so it's worth mentioning. 221 + * 222 + * Note that if we used a shared anonymous mapping in the Launcher instead of 223 + * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we 224 + * need that to switch the Launcher to processes (away from threads) anyway. 225 + :*/ 241 226 242 227 /*H:100 243 228 * Hypercalls ··· 268 229 return; 269 230 } 270 231 271 - /* The Guest has initialized. 232 + /* 233 + * The Guest has initialized. 272 234 * 273 - * Look in the hypercall ring for the async hypercalls: */ 235 + * Look in the hypercall ring for the async hypercalls: 236 + */ 274 237 do_async_hcalls(cpu); 275 238 276 - /* If we stopped reading the hypercall ring because the Guest did a 239 + /* 240 + * If we stopped reading the hypercall ring because the Guest did a 277 241 * NOTIFY to the Launcher, we want to return now. Otherwise we do 278 - * the hypercall. */ 242 + * the hypercall. 243 + */ 279 244 if (!cpu->pending_notify) { 280 245 do_hcall(cpu, cpu->hcall); 281 - /* Tricky point: we reset the hcall pointer to mark the 246 + /* 247 + * Tricky point: we reset the hcall pointer to mark the 282 248 * hypercall as "done". We use the hcall pointer rather than 283 249 * the trap number to indicate a hypercall is pending.
284 250 * Normally it doesn't matter: the Guest will run again and ··· 292 248 * However, if we are signalled or the Guest sends I/O to the 293 249 * Launcher, the run_guest() loop will exit without running the 294 250 * Guest. When it comes back it would try to re-run the 295 - * hypercall. Finding that bug sucked. */ 251 + * hypercall. Finding that bug sucked. 252 + */ 296 253 cpu->hcall = NULL; 297 254 } 298 255 } 299 256 300 - /* This routine supplies the Guest with time: it's used for wallclock time at 301 - * initial boot and as a rough time source if the TSC isn't available. */ 257 + /* 258 + * This routine supplies the Guest with time: it's used for wallclock time at 259 + * initial boot and as a rough time source if the TSC isn't available. 260 + */ 302 261 void write_timestamp(struct lg_cpu *cpu) 303 262 { 304 263 struct timespec now;

+191 -97

drivers/lguest/interrupts_and_traps.c

··· 1 - /*P:800 Interrupts (traps) are complicated enough to earn their own file. 1 + /*P:800 2 + * Interrupts (traps) are complicated enough to earn their own file. 2 3 * There are three classes of interrupts: 3 4 * 4 5 * 1) Real hardware interrupts which occur while we're running the Guest, ··· 11 10 * just like real hardware would deliver them. Traps from the Guest can be set 12 11 * up to go directly back into the Guest, but sometimes the Host wants to see 13 12 * them first, so we also have a way of "reflecting" them into the Guest as if 14 - * they had been delivered to it directly. :*/ 13 + * they had been delivered to it directly. 14 + :*/ 15 15 #include <linux/uaccess.h> 16 16 #include <linux/interrupt.h> 17 17 #include <linux/module.h> ··· 28 26 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); 29 27 } 30 28 31 - /* The "type" of the interrupt handler is a 4 bit field: we only support a 32 - * couple of types. */ 29 + /* 30 + * The "type" of the interrupt handler is a 4 bit field: we only support a 31 + * couple of types. 32 + */ 33 33 static int idt_type(u32 lo, u32 hi) 34 34 { 35 35 return (hi >> 8) & 0xF; ··· 43 39 return (hi & 0x8000); 44 40 } 45 41 46 - /* We need a helper to "push" a value onto the Guest's stack, since that's a 47 - * big part of what delivering an interrupt does. */ 42 + /* 43 + * We need a helper to "push" a value onto the Guest's stack, since that's a 44 + * big part of what delivering an interrupt does. 45 + */ 48 46 static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) 49 47 { 50 48 /* Stack grows upwards: move stack then write value. */ ··· 54 48 lgwrite(cpu, *gstack, u32, val); 55 49 } 56 50 57 - /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or 51 + /*H:210 52 + * The set_guest_interrupt() routine actually delivers the interrupt or 58 53 * trap. 
The mechanics of delivering traps and interrupts to the Guest are the 59 54 * same, except some traps have an "error code" which gets pushed onto the 60 55 * stack as well: the caller tells us if this is one. ··· 66 59 * 67 60 * We set up the stack just like the CPU does for a real interrupt, so it's 68 61 * identical for the Guest (and the standard "iret" instruction will undo 69 - * it). */ 62 + * it). 63 + */ 70 64 static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, 71 65 bool has_err) 72 66 { ··· 75 67 u32 eflags, ss, irq_enable; 76 68 unsigned long virtstack; 77 69 78 - /* There are two cases for interrupts: one where the Guest is already 70 + /* 71 + * There are two cases for interrupts: one where the Guest is already 79 72 * in the kernel, and a more complex one where the Guest is in 80 - * userspace. We check the privilege level to find out. */ 73 + * userspace. We check the privilege level to find out. 74 + */ 81 75 if ((cpu->regs->ss&0x3) != GUEST_PL) { 82 - /* The Guest told us their kernel stack with the SET_STACK 83 - * hypercall: both the virtual address and the segment */ 76 + /* 77 + * The Guest told us their kernel stack with the SET_STACK 78 + * hypercall: both the virtual address and the segment. 79 + */ 84 80 virtstack = cpu->esp1; 85 81 ss = cpu->ss1; 86 82 87 83 origstack = gstack = guest_pa(cpu, virtstack); 88 - /* We push the old stack segment and pointer onto the new 84 + /* 85 + * We push the old stack segment and pointer onto the new 89 86 * stack: when the Guest does an "iret" back from the interrupt 90 87 * handler the CPU will notice they're dropping privilege 91 - * levels and expect these here. */ 88 + * levels and expect these here. 
89 + */ 92 90 push_guest_stack(cpu, &gstack, cpu->regs->ss); 93 91 push_guest_stack(cpu, &gstack, cpu->regs->esp); 94 92 } else { ··· 105 91 origstack = gstack = guest_pa(cpu, virtstack); 106 92 } 107 93 108 - /* Remember that we never let the Guest actually disable interrupts, so 94 + /* 95 + * Remember that we never let the Guest actually disable interrupts, so 109 96 * the "Interrupt Flag" bit is always set. We copy that bit from the 110 97 * Guest's "irq_enabled" field into the eflags word: we saw the Guest 111 - * copy it back in "lguest_iret". */ 98 + * copy it back in "lguest_iret". 99 + */ 112 100 eflags = cpu->regs->eflags; 113 101 if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 114 102 && !(irq_enable & X86_EFLAGS_IF)) 115 103 eflags &= ~X86_EFLAGS_IF; 116 104 117 - /* An interrupt is expected to push three things on the stack: the old 105 + /* 106 + * An interrupt is expected to push three things on the stack: the old 118 107 * "eflags" word, the old code segment, and the old instruction 119 - * pointer. */ 108 + * pointer. 109 + */ 120 110 push_guest_stack(cpu, &gstack, eflags); 121 111 push_guest_stack(cpu, &gstack, cpu->regs->cs); 122 112 push_guest_stack(cpu, &gstack, cpu->regs->eip); ··· 129 111 if (has_err) 130 112 push_guest_stack(cpu, &gstack, cpu->regs->errcode); 131 113 132 - /* Now we've pushed all the old state, we change the stack, the code 133 - * segment and the address to execute. */ 114 + /* 115 + * Now we've pushed all the old state, we change the stack, the code 116 + * segment and the address to execute. 117 + */ 134 118 cpu->regs->ss = ss; 135 119 cpu->regs->esp = virtstack + (gstack - origstack); 136 120 cpu->regs->cs = (__KERNEL_CS|GUEST_PL); 137 121 cpu->regs->eip = idt_address(lo, hi); 138 122 139 - /* There are two kinds of interrupt handlers: 0xE is an "interrupt 140 - * gate" which expects interrupts to be disabled on entry. 
*/ 123 + /* 124 + * There are two kinds of interrupt handlers: 0xE is an "interrupt 125 + * gate" which expects interrupts to be disabled on entry. 126 + */ 141 127 if (idt_type(lo, hi) == 0xE) 142 128 if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) 143 129 kill_guest(cpu, "Disabling interrupts"); ··· 152 130 * 153 131 * interrupt_pending() returns the first pending interrupt which isn't blocked 154 132 * by the Guest. It is called before every entry to the Guest, and just before 155 - * we go to sleep when the Guest has halted itself. */ 133 + * we go to sleep when the Guest has halted itself. 134 + */ 156 135 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) 157 136 { 158 137 unsigned int irq; ··· 163 140 if (!cpu->lg->lguest_data) 164 141 return LGUEST_IRQS; 165 142 166 - /* Take our "irqs_pending" array and remove any interrupts the Guest 167 - * wants blocked: the result ends up in "blk". */ 143 + /* 144 + * Take our "irqs_pending" array and remove any interrupts the Guest 145 + * wants blocked: the result ends up in "blk". 146 + */ 168 147 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 169 148 sizeof(blk))) 170 149 return LGUEST_IRQS; ··· 179 154 return irq; 180 155 } 181 156 182 - /* This actually diverts the Guest to running an interrupt handler, once an 183 - * interrupt has been identified by interrupt_pending(). */ 157 + /* 158 + * This actually diverts the Guest to running an interrupt handler, once an 159 + * interrupt has been identified by interrupt_pending(). 160 + */ 184 161 void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) 185 162 { 186 163 struct desc_struct *idt; 187 164 188 165 BUG_ON(irq >= LGUEST_IRQS); 189 166 190 - /* They may be in the middle of an iret, where they asked us never to 191 - * deliver interrupts. */ 167 + /* 168 + * They may be in the middle of an iret, where they asked us never to 169 + * deliver interrupts. 
170 + */ 192 171 if (cpu->regs->eip >= cpu->lg->noirq_start && 193 172 (cpu->regs->eip < cpu->lg->noirq_end)) 194 173 return; ··· 216 187 } 217 188 } 218 189 219 - /* Look at the IDT entry the Guest gave us for this interrupt. The 190 + /* 191 + * Look at the IDT entry the Guest gave us for this interrupt. The 220 192 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 221 - * over them. */ 193 + * over them. 194 + */ 222 195 idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 223 196 /* If they don't have a handler (yet?), we just ignore it */ 224 197 if (idt_present(idt->a, idt->b)) { 225 198 /* OK, mark it no longer pending and deliver it. */ 226 199 clear_bit(irq, cpu->irqs_pending); 227 - /* set_guest_interrupt() takes the interrupt descriptor and a 200 + /* 201 + * set_guest_interrupt() takes the interrupt descriptor and a 228 202 * flag to say whether this interrupt pushes an error code onto 229 - * the stack as well: virtual interrupts never do. */ 203 + * the stack as well: virtual interrupts never do. 204 + */ 230 205 set_guest_interrupt(cpu, idt->a, idt->b, false); 231 206 } 232 207 233 - /* Every time we deliver an interrupt, we update the timestamp in the 208 + /* 209 + * Every time we deliver an interrupt, we update the timestamp in the 234 210 * Guest's lguest_data struct. It would be better for the Guest if we 235 211 * did this more often, but it can actually be quite slow: doing it 236 212 * here is a compromise which means at least it gets updated every 237 - * timer interrupt. */ 213 + * timer interrupt. 214 + */ 238 215 write_timestamp(cpu); 239 216 240 - /* If there are no other interrupts we want to deliver, clear 241 - * the pending flag. */ 217 + /* 218 + * If there are no other interrupts we want to deliver, clear 219 + * the pending flag. 220 + */ 242 221 if (!more) 243 222 put_user(0, &cpu->lg->lguest_data->irq_pending); 244 223 } ··· 254 217 /* And this is the routine when we want to set an interrupt for the Guest. 
*/ 255 218 void set_interrupt(struct lg_cpu *cpu, unsigned int irq) 256 219 { 257 - /* Next time the Guest runs, the core code will see if it can deliver 258 - * this interrupt. */ 220 + /* 221 + * Next time the Guest runs, the core code will see if it can deliver 222 + * this interrupt. 223 + */ 259 224 set_bit(irq, cpu->irqs_pending); 260 225 261 - /* Make sure it sees it; it might be asleep (eg. halted), or 262 - * running the Guest right now, in which case kick_process() 263 - * will knock it out. */ 226 + /* 227 + * Make sure it sees it; it might be asleep (eg. halted), or running 228 + * the Guest right now, in which case kick_process() will knock it out. 229 + */ 264 230 if (!wake_up_process(cpu->tsk)) 265 231 kick_process(cpu->tsk); 266 232 } 267 233 /*:*/ 268 234 269 - /* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 235 + /* 236 + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 270 237 * me a patch, so we support that too. It'd be a big step for lguest if half 271 238 * the Plan 9 user base were to start using it. 272 239 * 273 240 * Actually now I think of it, it's possible that Ron *is* half the Plan 9 274 - * userbase. Oh well. */ 241 + * userbase. Oh well. 242 + */ 275 243 static bool could_be_syscall(unsigned int num) 276 244 { 277 245 /* Normal Linux SYSCALL_VECTOR or reserved vector? */ ··· 316 274 clear_bit(syscall_vector, used_vectors); 317 275 } 318 276 319 - /*H:220 Now we've got the routines to deliver interrupts, delivering traps like 277 + /*H:220 278 + * Now we've got the routines to deliver interrupts, delivering traps like 320 279 * page fault is easy. The only trick is that Intel decided that some traps 321 - * should have error codes: */ 280 + * should have error codes: 281 + */ 322 282 static bool has_err(unsigned int trap) 323 283 { 324 284 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); ··· 329 285 /* deliver_trap() returns true if it could deliver the trap. 
*/ 330 286 bool deliver_trap(struct lg_cpu *cpu, unsigned int num) 331 287 { 332 - /* Trap numbers are always 8 bit, but we set an impossible trap number 333 - * for traps inside the Switcher, so check that here. */ 288 + /* 289 + * Trap numbers are always 8 bit, but we set an impossible trap number 290 + * for traps inside the Switcher, so check that here. 291 + */ 334 292 if (num >= ARRAY_SIZE(cpu->arch.idt)) 335 293 return false; 336 294 337 - /* Early on the Guest hasn't set the IDT entries (or maybe it put a 338 - * bogus one in): if we fail here, the Guest will be killed. */ 295 + /* 296 + * Early on the Guest hasn't set the IDT entries (or maybe it put a 297 + * bogus one in): if we fail here, the Guest will be killed. 298 + */ 339 299 if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) 340 300 return false; 341 301 set_guest_interrupt(cpu, cpu->arch.idt[num].a, ··· 347 299 return true; 348 300 } 349 301 350 - /*H:250 Here's the hard part: returning to the Host every time a trap happens 302 + /*H:250 303 + * Here's the hard part: returning to the Host every time a trap happens 351 304 * and then calling deliver_trap() and re-entering the Guest is slow. 352 305 * Particularly because Guest userspace system calls are traps (usually trap 353 306 * 128). ··· 360 311 * the other hypervisors would beat it up at lunchtime. 361 312 * 362 313 * This routine indicates if a particular trap number could be delivered 363 - * directly. */ 314 + * directly. 315 + */ 364 316 static bool direct_trap(unsigned int num) 365 317 { 366 - /* Hardware interrupts don't go to the Guest at all (except system 367 - * call). */ 318 + /* 319 + * Hardware interrupts don't go to the Guest at all (except system 320 + * call). 
321 + */ 368 322 if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 369 323 return false; 370 324 371 - /* The Host needs to see page faults (for shadow paging and to save the 325 + /* 326 + * The Host needs to see page faults (for shadow paging and to save the 372 327 * fault address), general protection faults (in/out emulation) and 373 328 * device not available (TS handling), invalid opcode fault (kvm hcall), 374 - * and of course, the hypercall trap. */ 329 + * and of course, the hypercall trap. 330 + */ 375 331 return num != 14 && num != 13 && num != 7 && 376 332 num != 6 && num != LGUEST_TRAP_ENTRY; 377 333 } 378 334 /*:*/ 379 335 380 - /*M:005 The Guest has the ability to turn its interrupt gates into trap gates, 336 + /*M:005 337 + * The Guest has the ability to turn its interrupt gates into trap gates, 381 338 * if it is careful. The Host will let trap gates go directly to the 382 339 * Guest, but the Guest needs the interrupts atomically disabled for an 383 340 * interrupt gate. It can do this by pointing the trap gate at instructions 384 - * within noirq_start and noirq_end, where it can safely disable interrupts. */ 341 + * within noirq_start and noirq_end, where it can safely disable interrupts. 342 + */ 385 343 386 - /*M:006 The Guests do not use the sysenter (fast system call) instruction, 344 + /*M:006 345 + * The Guests do not use the sysenter (fast system call) instruction, 387 346 * because it's hardcoded to enter privilege level 0 and so can't go direct. 388 347 * It's about twice as fast as the older "int 0x80" system call, so it might 389 348 * still be worthwhile to handle it in the Switcher and lcall down to the 390 349 * Guest.
The sysenter semantics are hairy tho: search for that keyword in 391 - * entry.S :*/ 350 + * entry.S 351 + :*/ 392 352 393 - /*H:260 When we make traps go directly into the Guest, we need to make sure 353 + /*H:260 354 + * When we make traps go directly into the Guest, we need to make sure 394 355 * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the 395 356 * CPU trying to deliver the trap will fault while trying to push the interrupt 396 357 * words on the stack: this is called a double fault, and it forces us to kill 397 358 * the Guest. 398 359 * 399 - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ 360 + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. 361 + */ 400 362 void pin_stack_pages(struct lg_cpu *cpu) 401 363 { 402 364 unsigned int i; 403 365 404 - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or 405 - * two pages of stack space. */ 366 + /* 367 + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or 368 + * two pages of stack space. 369 + */ 406 370 for (i = 0; i < cpu->lg->stack_pages; i++) 407 - /* The stack grows *upwards*, so the address we're given is the 371 + /* 372 + * The stack grows *upwards*, so the address we're given is the 408 373 * start of the page after the kernel stack. Subtract one to 409 374 * get back onto the first stack page, and keep subtracting to 410 - * get to the rest of the stack pages. */ 375 + * get to the rest of the stack pages. 376 + */ 411 377 pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); 412 378 } 413 379 414 - /* Direct traps also mean that we need to know whenever the Guest wants to use 380 + /* 381 + * Direct traps also mean that we need to know whenever the Guest wants to use 415 382 * a different kernel stack, so we can change the IDT entries to use that 416 383 * stack. 
The IDT entries expect a virtual address, so unlike most addresses 417 384 * the Guest gives us, the "esp" (stack pointer) value here is virtual, not 418 385 * physical. 419 386 * 420 387 * In Linux each process has its own kernel stack, so this happens a lot: we 421 - * change stacks on each context switch. */ 388 + * change stacks on each context switch. 389 + */ 422 390 void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) 423 391 { 424 - /* You are not allowed have a stack segment with privilege level 0: bad 425 - * Guest! */ 392 + /* 393 + * You're not allowed a stack segment with privilege level 0: bad Guest! 394 + */ 426 395 if ((seg & 0x3) != GUEST_PL) 427 396 kill_guest(cpu, "bad stack segment %i", seg); 428 397 /* We only expect one or two stack pages. */ ··· 454 387 pin_stack_pages(cpu); 455 388 } 456 389 457 - /* All this reference to mapping stacks leads us neatly into the other complex 458 - * part of the Host: page table handling. */ 390 + /* 391 + * All this reference to mapping stacks leads us neatly into the other complex 392 + * part of the Host: page table handling. 393 + */ 459 394 460 - /*H:235 This is the routine which actually checks the Guest's IDT entry and 461 - * transfers it into the entry in "struct lguest": */ 395 + /*H:235 396 + * This is the routine which actually checks the Guest's IDT entry and 397 + * transfers it into the entry in "struct lguest": 398 + */ 462 399 static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, 463 400 unsigned int num, u32 lo, u32 hi) 464 401 { ··· 478 407 if (type != 0xE && type != 0xF) 479 408 kill_guest(cpu, "bad IDT type %i", type); 480 409 481 - /* We only copy the handler address, present bit, privilege level and 410 + /* 411 + * We only copy the handler address, present bit, privilege level and 482 412 * type. The privilege level controls where the trap can be triggered 483 413 * manually with an "int" instruction. 
This is usually GUEST_PL, 484 - * except for system calls which userspace can use. */ 414 + * except for system calls which userspace can use. 415 + */ 485 416 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); 486 417 trap->b = (hi&0xFFFFEF00); 487 418 } 488 419 489 - /*H:230 While we're here, dealing with delivering traps and interrupts to the 420 + /*H:230 421 + * While we're here, dealing with delivering traps and interrupts to the 490 422 * Guest, we might as well complete the picture: how the Guest tells us where 491 423 * it wants them to go. This would be simple, except making traps fast 492 424 * requires some tricks. 493 425 * 494 426 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 495 - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ 427 + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. 428 + */ 496 429 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) 497 430 { 498 - /* Guest never handles: NMI, doublefault, spurious interrupt or 499 - * hypercall. We ignore when it tries to set them. */ 431 + /* 432 + * Guest never handles: NMI, doublefault, spurious interrupt or 433 + * hypercall. We ignore when it tries to set them. 434 + */ 500 435 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) 501 436 return; 502 437 503 - /* Mark the IDT as changed: next time the Guest runs we'll know we have 504 - * to copy this again. */ 438 + /* 439 + * Mark the IDT as changed: next time the Guest runs we'll know we have 440 + * to copy this again. 441 + */ 505 442 cpu->changed |= CHANGED_IDT; 506 443 507 444 /* Check that the Guest doesn't try to step outside the bounds. */ ··· 519 440 set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); 520 441 } 521 442 522 - /* The default entry for each interrupt points into the Switcher routines which 443 + /* 444 + * The default entry for each interrupt points into the Switcher routines which 523 445 * simply return to the Host. 
The run_guest() loop will then call 524 - * deliver_trap() to bounce it back into the Guest. */ 446 + * deliver_trap() to bounce it back into the Guest. 447 + */ 525 448 static void default_idt_entry(struct desc_struct *idt, 526 449 int trap, 527 450 const unsigned long handler, ··· 532 451 /* A present interrupt gate. */ 533 452 u32 flags = 0x8e00; 534 453 535 - /* Set the privilege level on the entry for the hypercall: this allows 536 - * the Guest to use the "int" instruction to trigger it. */ 454 + /* 455 + * Set the privilege level on the entry for the hypercall: this allows 456 + * the Guest to use the "int" instruction to trigger it. 457 + */ 537 458 if (trap == LGUEST_TRAP_ENTRY) 538 459 flags |= (GUEST_PL << 13); 539 460 else if (base) 540 - /* Copy priv. level from what Guest asked for. This allows 541 - * debug (int 3) traps from Guest userspace, for example. */ 461 + /* 462 + * Copy privilege level from what Guest asked for. This allows 463 + * debug (int 3) traps from Guest userspace, for example. 464 + */ 542 465 flags |= (base->b & 0x6000); 543 466 544 467 /* Now pack it into the IDT entry in its weird format. */ ··· 560 475 default_idt_entry(&state->guest_idt[i], i, def[i], NULL); 561 476 } 562 477 563 - /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead 478 + /*H:240 479 + * We don't use the IDT entries in the "struct lguest" directly, instead 564 480 * we copy them into the IDT which we've set up for Guests on this CPU, just 565 - * before we run the Guest. This routine does that copy. */ 481 + * before we run the Guest. This routine does that copy. 482 + */ 566 483 void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 567 484 const unsigned long *def) 568 485 { 569 486 unsigned int i; 570 487 571 - /* We can simply copy the direct traps, otherwise we use the default 572 - * ones in the Switcher: they will return to the Host. 
*/ 488 + /* 489 + * We can simply copy the direct traps, otherwise we use the default 490 + * ones in the Switcher: they will return to the Host. 491 + */ 573 492 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { 574 493 const struct desc_struct *gidt = &cpu->arch.idt[i]; 575 494 ··· 581 492 if (!direct_trap(i)) 582 493 continue; 583 494 584 - /* Only trap gates (type 15) can go direct to the Guest. 495 + /* 496 + * Only trap gates (type 15) can go direct to the Guest. 585 497 * Interrupt gates (type 14) disable interrupts as they are 586 498 * entered, which we never let the Guest do. Not present 587 499 * entries (type 0x0) also can't go direct, of course. 588 500 * 589 501 * If it can't go direct, we still need to copy the priv. level: 590 502 * they might want to give userspace access to a software 591 - * interrupt. */ 503 + * interrupt. 504 + */ 592 505 if (idt_type(gidt->a, gidt->b) == 0xF) 593 506 idt[i] = *gidt; 594 507 else ··· 609 518 * the next timer interrupt (in nanoseconds). We use the high-resolution timer 610 519 * infrastructure to set a callback at that time. 611 520 * 612 - * 0 means "turn off the clock". */ 521 + * 0 means "turn off the clock". 522 + */ 613 523 void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) 614 524 { 615 525 ktime_t expires; ··· 621 529 return; 622 530 } 623 531 624 - /* We use wallclock time here, so the Guest might not be running for 532 + /* 533 + * We use wallclock time here, so the Guest might not be running for 625 534 * all the time between now and the timer interrupt it asked for. This 626 - * is almost always the right thing to do. */ 535 + * is almost always the right thing to do. 536 + */ 627 537 expires = ktime_add_ns(ktime_get_real(), delta); 628 538 hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); 629 539 }

+18 -14

drivers/lguest/lg.h

··· 16 16 void free_pagetables(void); 17 17 int init_pagetables(struct page **switcher_page, unsigned int pages); 18 18 19 - struct pgdir 20 - { 19 + struct pgdir { 21 20 unsigned long gpgdir; 22 21 pgd_t *pgdir; 23 22 }; 24 23 25 24 /* We have two pages shared with guests, per cpu. */ 26 - struct lguest_pages 27 - { 25 + struct lguest_pages { 28 26 /* This is the stack page mapped rw in guest */ 29 27 char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; 30 28 struct lguest_regs regs; ··· 52 54 53 55 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 54 56 55 - /* At end of a page shared mapped over lguest_pages in guest. */ 57 + /* At end of a page shared mapped over lguest_pages in guest. */ 56 58 unsigned long regs_page; 57 59 struct lguest_regs *regs; 58 60 59 61 struct lguest_pages *last_pages; 60 62 61 - int cpu_pgd; /* which pgd this cpu is currently using */ 63 + int cpu_pgd; /* Which pgd this cpu is currently using */ 62 64 63 65 /* If a hypercall was asked for, this points to the arguments. */ 64 66 struct hcall_args *hcall; ··· 87 89 }; 88 90 89 91 /* The private info the thread maintains about the guest. */ 90 - struct lguest 91 - { 92 + struct lguest { 92 93 struct lguest_data __user *lguest_data; 93 94 struct lg_cpu cpus[NR_CPUS]; 94 95 unsigned int nr_cpus; 95 96 96 97 u32 pfn_limit; 97 - /* This provides the offset to the base of guest-physical 98 - * memory in the Launcher. */ 98 + 99 + /* 100 + * This provides the offset to the base of guest-physical memory in the 101 + * Launcher. 
102 + */ 99 103 void __user *mem_base; 100 104 unsigned long kernel_address; 101 105 ··· 122 122 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 123 123 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 124 124 125 - /*H:035 Using memory-copy operations like that is usually inconvient, so we 125 + /*H:035 126 + * Using memory-copy operations like that is usually inconvenient, so we 126 127 * have the following helper macros which read and write a specific type (often 127 128 * an unsigned long). 128 129 * 129 - * This reads into a variable of the given type then returns that. */ 130 + * This reads into a variable of the given type then returns that. 131 + */ 130 132 #define lgread(cpu, addr, type) \ 131 133 ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) 132 134 ··· 142 140 143 141 int run_guest(struct lg_cpu *cpu, unsigned long __user *user); 144 142 145 - /* Helper macros to obtain the first 12 or the last 20 bits, this is only the 143 + /* 144 + * Helper macros to obtain the first 12 or the last 20 bits, this is only the 146 145 * first step in the migration to the kernel types. pte_pfn is already defined 147 - * in the kernel. */ 146 + * in the kernel. 147 + */ 148 148 #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 149 149 #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 150 150 #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)

+102 -58

drivers/lguest/lguest_device.c

··· 1 - /*P:050 Lguest guests use a very simple method to describe devices. It's a 1 + /*P:050 2 + * Lguest guests use a very simple method to describe devices. It's a 2 3 * series of device descriptors contained just above the top of normal Guest 3 4 * memory. 4 5 * 5 6 * We use the standard "virtio" device infrastructure, which provides us with a 6 7 * console, a network and a block driver. Each one expects some configuration 7 - * information and a "virtqueue" or two to send and receive data. :*/ 8 + * information and a "virtqueue" or two to send and receive data. 9 + :*/ 8 10 #include <linux/init.h> 9 11 #include <linux/bootmem.h> 10 12 #include <linux/lguest_launcher.h> ··· 22 20 /* The pointer to our (page) of device descriptions. */ 23 21 static void *lguest_devices; 24 22 25 - /* For Guests, device memory can be used as normal memory, so we cast away the 26 - * __iomem to quieten sparse. */ 23 + /* 24 + * For Guests, device memory can be used as normal memory, so we cast away the 25 + * __iomem to quieten sparse. 26 + */ 27 27 static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) 28 28 { 29 29 return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); ··· 36 32 iounmap((__force void __iomem *)addr); 37 33 } 38 34 39 - /*D:100 Each lguest device is just a virtio device plus a pointer to its entry 40 - * in the lguest_devices page. */ 35 + /*D:100 36 + * Each lguest device is just a virtio device plus a pointer to its entry 37 + * in the lguest_devices page. 38 + */ 41 39 struct lguest_device { 42 40 struct virtio_device vdev; 43 41 ··· 47 41 struct lguest_device_desc *desc; 48 42 }; 49 43 50 - /* Since the virtio infrastructure hands us a pointer to the virtio_device all 44 + /* 45 + * Since the virtio infrastructure hands us a pointer to the virtio_device all 51 46 * the time, it helps to have a curt macro to get a pointer to the struct 52 - * lguest_device it's enclosed in. */ 47 + * lguest_device it's enclosed in. 
48 + */ 53 49 #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) 54 50 55 51 /*D:130 ··· 63 55 * the driver will look at them during setup. 64 56 * 65 57 * A convenient routine to return the device's virtqueue config array: 66 - * immediately after the descriptor. */ 58 + * immediately after the descriptor. 59 + */ 67 60 static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) 68 61 { 69 62 return (void *)(desc + 1); ··· 107 98 return features; 108 99 } 109 100 110 - /* The virtio core takes the features the Host offers, and copies the 111 - * ones supported by the driver into the vdev->features array. Once 112 - * that's all sorted out, this routine is called so we can tell the 113 - * Host which features we understand and accept. */ 101 + /* 102 + * The virtio core takes the features the Host offers, and copies the ones 103 + * supported by the driver into the vdev->features array. Once that's all 104 + * sorted out, this routine is called so we can tell the Host which features we 105 + * understand and accept. 106 + */ 114 107 static void lg_finalize_features(struct virtio_device *vdev) 115 108 { 116 109 unsigned int i, bits; ··· 123 112 /* Give virtio_ring a chance to accept features. */ 124 113 vring_transport_features(vdev); 125 114 126 - /* The vdev->feature array is a Linux bitmask: this isn't the 127 - * same as a the simple array of bits used by lguest devices 128 - * for features. So we do this slow, manual conversion which is 129 - * completely general. */ 115 + /* 116 + * The vdev->feature array is a Linux bitmask: this isn't the same as a 117 + * the simple array of bits used by lguest devices for features. So we 118 + * do this slow, manual conversion which is completely general. 
119 + */ 130 120 memset(out_features, 0, desc->feature_len); 131 121 bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; 132 122 for (i = 0; i < bits; i++) { ··· 158 146 memcpy(lg_config(desc) + offset, buf, len); 159 147 } 160 148 161 - /* The operations to get and set the status word just access the status field 162 - * of the device descriptor. */ 149 + /* 150 + * The operations to get and set the status word just access the status field 151 + * of the device descriptor. 152 + */ 163 153 static u8 lg_get_status(struct virtio_device *vdev) 164 154 { 165 155 return to_lgdev(vdev)->desc->status; 166 156 } 167 157 168 - /* To notify on status updates, we (ab)use the NOTIFY hypercall, with the 169 - * descriptor address of the device. A zero status means "reset". */ 158 + /* 159 + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the 160 + * descriptor address of the device. A zero status means "reset". 161 + */ 170 162 static void set_status(struct virtio_device *vdev, u8 status) 171 163 { 172 164 unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; ··· 207 191 */ 208 192 209 193 /*D:140 This is the information we remember about each virtqueue. */ 210 - struct lguest_vq_info 211 - { 194 + struct lguest_vq_info { 212 195 /* A copy of the information contained in the device config. */ 213 196 struct lguest_vqconfig config; 214 197 ··· 215 200 void *pages; 216 201 }; 217 202 218 - /* When the virtio_ring code wants to prod the Host, it calls us here and we 203 + /* 204 + * When the virtio_ring code wants to prod the Host, it calls us here and we 219 205 * make a hypercall. We hand the physical address of the virtqueue so the Host 220 - * knows which virtqueue we're talking about. */ 206 + * knows which virtqueue we're talking about. 207 + */ 221 208 static void lg_notify(struct virtqueue *vq) 222 209 { 223 - /* We store our virtqueue information in the "priv" pointer of the 224 - * virtqueue structure. 
*/ 210 + /* 211 + * We store our virtqueue information in the "priv" pointer of the 212 + * virtqueue structure. 213 + */ 225 214 struct lguest_vq_info *lvq = vq->priv; 226 215 227 216 kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); ··· 234 215 /* An extern declaration inside a C file is bad form. Don't do it. */ 235 216 extern void lguest_setup_irq(unsigned int irq); 236 217 237 - /* This routine finds the first virtqueue described in the configuration of 218 + /* 219 + * This routine finds the Nth virtqueue described in the configuration of 238 220 * this device and sets it up. 239 221 * 240 222 * This is kind of an ugly duckling. It'd be nicer to have a standard ··· 243 223 * everyone wants to do it differently. The KVM coders want the Guest to 244 224 * allocate its own pages and tell the Host where they are, but for lguest it's 245 225 * simpler for the Host to simply tell us where the pages are. 246 - * 247 - * So we provide drivers with a "find the Nth virtqueue and set it up" 248 - * function. */ 226 + */ 249 227 static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 250 228 unsigned index, 251 229 void (*callback)(struct virtqueue *vq), ··· 262 244 if (!lvq) 263 245 return ERR_PTR(-ENOMEM); 264 246 265 - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after 247 + /* 248 + * Make a copy of the "struct lguest_vqconfig" entry, which sits after 266 249 * the descriptor. We need a copy because the config space might not 267 - * be aligned correctly. */ 250 + * be aligned correctly. 251 + */ 268 252 memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); 269 253 270 254 printk("Mapping virtqueue %i addr %lx\n", index, ··· 281 261 goto free_lvq; 282 262 } 283 263 284 - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size 285 - * and we've got a pointer to its pages. */ 264 + /* 265 + * OK, tell virtio_ring.c to set up a virtqueue now we know its size 266 + * and we've got a pointer to its pages. 
267 + */ 286 268 vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, 287 269 vdev, lvq->pages, lg_notify, callback, name); 288 270 if (!vq) { ··· 295 273 /* Make sure the interrupt is allocated. */ 296 274 lguest_setup_irq(lvq->config.irq); 297 275 298 - /* Tell the interrupt for this virtqueue to go to the virtio_ring 299 - * interrupt handler. */ 300 - /* FIXME: We used to have a flag for the Host to tell us we could use 276 + /* 277 + * Tell the interrupt for this virtqueue to go to the virtio_ring 278 + * interrupt handler. 279 + * 280 + * FIXME: We used to have a flag for the Host to tell us we could use 301 281 * the interrupt as a source of randomness: it'd be nice to have that 302 - * back.. */ 282 + * back. 283 + */ 303 284 err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, 304 285 dev_name(&vdev->dev), vq); 305 286 if (err) 306 287 goto destroy_vring; 307 288 308 - /* Last of all we hook up our 'struct lguest_vq_info" to the 309 - * virtqueue's priv pointer. */ 289 + /* 290 + * Last of all we hook up our 'struct lguest_vq_info" to the 291 + * virtqueue's priv pointer. 292 + */ 310 293 vq->priv = lvq; 311 294 return vq; 312 295 ··· 385 358 .del_vqs = lg_del_vqs, 386 359 }; 387 360 388 - /* The root device for the lguest virtio devices. This makes them appear as 389 - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ 361 + /* 362 + * The root device for the lguest virtio devices. This makes them appear as 363 + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. 364 + */ 390 365 static struct device *lguest_root; 391 366 392 - /*D:120 This is the core of the lguest bus: actually adding a new device. 367 + /*D:120 368 + * This is the core of the lguest bus: actually adding a new device. 393 369 * It's a separate function because it's neater that way, and because an 394 370 * earlier version of the code supported hotplug and unplug. They were removed 395 371 * early on because they were never used. 
··· 401 371 * 402 372 * It's worth reading this carefully: we start with a pointer to the new device 403 373 * descriptor in the "lguest_devices" page, and the offset into the device 404 - * descriptor page so we can uniquely identify it if things go badly wrong. */ 374 + * descriptor page so we can uniquely identify it if things go badly wrong. 375 + */ 405 376 static void add_lguest_device(struct lguest_device_desc *d, 406 377 unsigned int offset) 407 378 { 408 379 struct lguest_device *ldev; 409 380 410 - /* Start with zeroed memory; Linux's device layer seems to count on 411 - * it. */ 381 + /* Start with zeroed memory; Linux's device layer counts on it. */ 412 382 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 413 383 if (!ldev) { 414 384 printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", ··· 418 388 419 389 /* This devices' parent is the lguest/ dir. */ 420 390 ldev->vdev.dev.parent = lguest_root; 421 - /* We have a unique device index thanks to the dev_index counter. */ 391 + /* 392 + * The device type comes straight from the descriptor. There's also a 393 + * device vendor field in the virtio_device struct, which we leave as 394 + * 0. 395 + */ 422 396 ldev->vdev.id.device = d->type; 423 - /* We have a simple set of routines for querying the device's 424 - * configuration information and setting its status. */ 397 + /* 398 + * We have a simple set of routines for querying the device's 399 + * configuration information and setting its status. 400 + */ 425 401 ldev->vdev.config = &lguest_config_ops; 426 402 /* And we remember the device's descriptor for lguest_config_ops. */ 427 403 ldev->desc = d; 428 404 429 - /* register_virtio_device() sets up the generic fields for the struct 405 + /* 406 + * register_virtio_device() sets up the generic fields for the struct 430 407 * virtio_device and calls device_register(). This makes the bus 431 - * infrastructure look for a matching driver. */ 408 + * infrastructure look for a matching driver. 
409 + */ 432 410 if (register_virtio_device(&ldev->vdev) != 0) { 433 411 printk(KERN_ERR "Failed to register lguest dev %u type %u\n", 434 412 offset, d->type); ··· 444 406 } 445 407 } 446 408 447 - /*D:110 scan_devices() simply iterates through the device page. The type 0 is 448 - * reserved to mean "end of devices". */ 409 + /*D:110 410 + * scan_devices() simply iterates through the device page. The type 0 is 411 + * reserved to mean "end of devices". 412 + */ 449 413 static void scan_devices(void) 450 414 { 451 415 unsigned int i; ··· 466 426 } 467 427 } 468 428 469 - /*D:105 Fairly early in boot, lguest_devices_init() is called to set up the 429 + /*D:105 430 + * Fairly early in boot, lguest_devices_init() is called to set up the 470 431 * lguest device infrastructure. We check that we are a Guest by checking 471 432 * pv_info.name: there are other ways of checking, but this seems most 472 433 * obvious to me. ··· 478 437 * correct sysfs incantation). 479 438 * 480 439 * Finally we call scan_devices() which adds all the devices found in the 481 - * lguest_devices page. */ 440 + * lguest_devices page. 441 + */ 482 442 static int __init lguest_devices_init(void) 483 443 { 484 444 if (strcmp(pv_info.name, "lguest") != 0) ··· 498 456 /* We do this after core stuff, but before the drivers. */ 499 457 postcore_initcall(lguest_devices_init); 500 458 501 - /*D:150 At this point in the journey we used to now wade through the lguest 459 + /*D:150 460 + * At this point in the journey we used to now wade through the lguest 502 461 * devices themselves: net, block and console. Since they're all now virtio 503 462 * devices rather than lguest-specific, I've decided to ignore them. Mostly, 504 463 * they're kind of boring. But this does mean you'll never experience the 505 464 * thrill of reading the forbidden love scene buried deep in the block driver. 
506 465 * 507 466 * "make Launcher" beckons, where we answer questions like "Where do Guests 508 - * come from?", and "What do you do when someone asks for optimization?". */ 467 + * come from?", and "What do you do when someone asks for optimization?". 468 + */

+178 -54

drivers/lguest/lguest_user.c

··· 1 1 /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 2 2 * controls and communicates with the Guest. For example, the first write will 3 - * tell us the Guest's memory layout, pagetable, entry point and kernel address 4 - * offset. A read will run the Guest until something happens, such as a signal 5 - * or the Guest doing a NOTIFY out to the Launcher. :*/ 3 + * tell us the Guest's memory layout and entry point. A read will run the 4 + * Guest until something happens, such as a signal or the Guest doing a NOTIFY 5 + * out to the Launcher. 6 + :*/ 6 7 #include <linux/uaccess.h> 7 8 #include <linux/miscdevice.h> 8 9 #include <linux/fs.h> ··· 12 11 #include <linux/file.h> 13 12 #include "lg.h" 14 13 14 + /*L:056 15 + * Before we move on, let's jump ahead and look at what the kernel does when 16 + * it needs to look up the eventfds. That will complete our picture of how we 17 + * use RCU. 18 + * 19 + * The notification value is in cpu->pending_notify: we return true if it went 20 + * to an eventfd. 21 + */ 15 22 bool send_notify_to_eventfd(struct lg_cpu *cpu) 16 23 { 17 24 unsigned int i; 18 25 struct lg_eventfd_map *map; 19 26 20 - /* lg->eventfds is RCU-protected */ 27 + /* 28 + * This "rcu_read_lock()" helps track when someone is still looking at 29 + * the (RCU-using) eventfds array. It's not actually a lock at all; 30 + * indeed it's a noop in many configurations. (You didn't expect me to 31 + * explain all the RCU secrets here, did you?) 32 + */ 21 33 rcu_read_lock(); 34 + /* 35 + * rcu_dereference is the counter-side of rcu_assign_pointer(); it 36 + * makes sure we don't access the memory pointed to by 37 + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, 38 + * but Alpha allows this! 
Paul McKenney points out that a really 39 + * aggressive compiler could have the same effect: 40 + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html 41 + * 42 + * So play safe, use rcu_dereference to get the rcu-protected pointer: 43 + */ 22 44 map = rcu_dereference(cpu->lg->eventfds); 45 + /* 46 + * Simple array search: even if they add an eventfd while we do this, 47 + * we'll continue to use the old array and just won't see the new one. 48 + */ 23 49 for (i = 0; i < map->num; i++) { 24 50 if (map->map[i].addr == cpu->pending_notify) { 25 51 eventfd_signal(map->map[i].event, 1); ··· 54 26 break; 55 27 } 56 28 } 29 + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ 57 30 rcu_read_unlock(); 31 + 32 + /* If we cleared the notification, it's because we found a match. */ 58 33 return cpu->pending_notify == 0; 59 34 } 60 35 36 + /*L:055 37 + * One of the more tricksy tricks in the Linux Kernel is a technique called 38 + * Read Copy Update. Since one point of lguest is to teach lguest journeyers 39 + * about kernel coding, I use it here. (In case you're curious, other purposes 40 + * include learning about virtualization and instilling a deep appreciation for 41 + * simplicity and puppies). 42 + * 43 + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we 44 + * add new eventfds without ever blocking readers from accessing the array. 45 + * The current Launcher only does this during boot, so that never happens. But 46 + * Read Copy Update is cool, and adding a lock risks damaging even more puppies 47 + * than this code does. 48 + * 49 + * We allocate a brand new one-larger array, copy the old one and add our new 50 + * element. Then we make the lg eventfd pointer point to the new array. 51 + * That's the easy part: now we need to free the old one, but we need to make 52 + * sure no slow CPU somewhere is still looking at it. 
That's what 53 + * synchronize_rcu does for us: waits until every CPU has indicated that it has 54 + * moved on to know it's no longer using the old one. 55 + * 56 + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. 57 + */ 61 58 static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) 62 59 { 63 60 struct lg_eventfd_map *new, *old = lg->eventfds; 64 61 62 + /* 63 + * We don't allow notifications on value 0 anyway (pending_notify of 64 + * 0 means "nothing pending"). 65 + */ 65 66 if (!addr) 66 67 return -EINVAL; 67 68 68 - /* Replace the old array with the new one, carefully: others can 69 - * be accessing it at the same time */ 69 + /* 70 + * Replace the old array with the new one, carefully: others can 71 + * be accessing it at the same time. 72 + */ 70 73 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), 71 74 GFP_KERNEL); 72 75 if (!new) ··· 111 52 new->map[new->num].addr = addr; 112 53 new->map[new->num].event = eventfd_ctx_fdget(fd); 113 54 if (IS_ERR(new->map[new->num].event)) { 55 + int err = PTR_ERR(new->map[new->num].event); 114 56 kfree(new); 115 - return PTR_ERR(new->map[new->num].event); 57 + return err; 116 58 } 117 59 new->num++; 118 60 119 - /* Now put new one in place. */ 61 + /* 62 + * Now put new one in place: rcu_assign_pointer() is a fancy way of 63 + * doing "lg->eventfds = new", but it uses memory barriers to make 64 + * absolutely sure that the contents of "new" written above is nailed 65 + * down before we actually do the assignment. 66 + * 67 + * We have to think about these kinds of things when we're operating on 68 + * live data without locks. 69 + */ 120 70 rcu_assign_pointer(lg->eventfds, new); 121 71 122 - /* We're not in a big hurry. Wait until noone's looking at old 123 - * version, then delete it. */ 72 + /* 73 + * We're not in a big hurry. Wait until noone's looking at old 74 + * version, then free it. 
75 + */ 124 76 synchronize_rcu(); 125 77 kfree(old); 126 78 127 79 return 0; 128 80 } 129 81 82 + /*L:052 83 + * Receiving notifications from the Guest is usually done by attaching a 84 + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will 85 + * become readable when the Guest does an LHCALL_NOTIFY with that value. 86 + * 87 + * This is really convenient for processing each virtqueue in a separate 88 + * thread. 89 + */ 130 90 static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) 131 91 { 132 92 unsigned long addr, fd; ··· 157 79 if (get_user(fd, input) != 0) 158 80 return -EFAULT; 159 81 82 + /* 83 + * Just make sure two callers don't add eventfds at once. We really 84 + * only need to lock against callers adding to the same Guest, so using 85 + * the Big Lguest Lock is overkill. But this is setup, not a fast path. 86 + */ 160 87 mutex_lock(&lguest_lock); 161 88 err = add_eventfd(lg, addr, fd); 162 89 mutex_unlock(&lguest_lock); 163 90 164 - return 0; 91 + return err; 165 92 } 166 93 167 - /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 168 - * number to /dev/lguest. */ 94 + /*L:050 95 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 96 + * number to /dev/lguest. 97 + */ 169 98 static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) 170 99 { 171 100 unsigned long irq; ··· 182 97 if (irq >= LGUEST_IRQS) 183 98 return -EINVAL; 184 99 100 + /* 101 + * Next time the Guest runs, the core code will see if it can deliver 102 + * this interrupt. 103 + */ 185 104 set_interrupt(cpu, irq); 186 105 return 0; 187 106 } 188 107 189 - /*L:040 Once our Guest is initialized, the Launcher makes it run by reading 190 - * from /dev/lguest. */ 108 + /*L:040 109 + * Once our Guest is initialized, the Launcher makes it run by reading 110 + * from /dev/lguest. 
111 + */ 191 112 static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 192 113 { 193 114 struct lguest *lg = file->private_data; ··· 229 138 return len; 230 139 } 231 140 232 - /* If we returned from read() last time because the Guest sent I/O, 233 - * clear the flag. */ 141 + /* 142 + * If we returned from read() last time because the Guest sent I/O, 143 + * clear the flag. 144 + */ 234 145 if (cpu->pending_notify) 235 146 cpu->pending_notify = 0; 236 147 ··· 240 147 return run_guest(cpu, (unsigned long __user *)user); 241 148 } 242 149 243 - /*L:025 This actually initializes a CPU. For the moment, a Guest is only 244 - * uniprocessor, so "id" is always 0. */ 150 + /*L:025 151 + * This actually initializes a CPU. For the moment, a Guest is only 152 + * uniprocessor, so "id" is always 0. 153 + */ 245 154 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 246 155 { 247 156 /* We have a limited number of CPUs in the lguest struct. */ ··· 258 163 /* Each CPU has a timer it can set. */ 259 164 init_clockdev(cpu); 260 165 261 - /* We need a complete page for the Guest registers: they are accessible 262 - * to the Guest and we can only grant it access to whole pages. */ 166 + /* 167 + * We need a complete page for the Guest registers: they are accessible 168 + * to the Guest and we can only grant it access to whole pages. 169 + */ 263 170 cpu->regs_page = get_zeroed_page(GFP_KERNEL); 264 171 if (!cpu->regs_page) 265 172 return -ENOMEM; ··· 269 172 /* We actually put the registers at the bottom of the page. */ 270 173 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 271 174 272 - /* Now we initialize the Guest's registers, handing it the start 273 - * address. */ 175 + /* 176 + * Now we initialize the Guest's registers, handing it the start 177 + * address. 178 + */ 274 179 lguest_arch_setup_regs(cpu, start_ip); 275 180 276 - /* We keep a pointer to the Launcher task (ie.
current task) for when 277 - * other Guests want to wake this one (eg. console input). */ 181 + /* 182 + * We keep a pointer to the Launcher task (ie. current task) for when 183 + * other Guests want to wake this one (eg. console input). 184 + */ 278 185 cpu->tsk = current; 279 186 280 - /* We need to keep a pointer to the Launcher's memory map, because if 187 + /* 188 + * We need to keep a pointer to the Launcher's memory map, because if 281 189 * the Launcher dies we need to clean it up. If we don't keep a 282 - * reference, it is destroyed before close() is called. */ 190 + * reference, it is destroyed before close() is called. 191 + */ 283 192 cpu->mm = get_task_mm(cpu->tsk); 284 193 285 - /* We remember which CPU's pages this Guest used last, for optimization 286 - * when the same Guest runs on the same CPU twice. */ 194 + /* 195 + * We remember which CPU's pages this Guest used last, for optimization 196 + * when the same Guest runs on the same CPU twice. 197 + */ 287 198 cpu->last_pages = NULL; 288 199 289 200 /* No error == success. */ 290 201 return 0; 291 202 } 292 203 293 - /*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) 294 - * values (in addition to the LHREQ_INITIALIZE value). These are: 204 + /*L:020 205 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in 206 + * addition to the LHREQ_INITIALIZE value). These are: 295 207 * 296 208 * base: The start of the Guest-physical memory inside the Launcher memory. 297 209 * ··· 312 206 */ 313 207 static int initialize(struct file *file, const unsigned long __user *input) 314 208 { 315 - /* "struct lguest" contains everything we (the Host) know about a 316 - * Guest. */ 209 + /* "struct lguest" contains all we (the Host) know about a Guest. */ 317 210 struct lguest *lg; 318 211 int err; 319 212 unsigned long args[3]; 320 213 321 - /* We grab the Big Lguest lock, which protects against multiple 322 - * simultaneous initializations. 
*/ 214 + /* 215 + * We grab the Big Lguest lock, which protects against multiple 216 + * simultaneous initializations. 217 + */ 323 218 mutex_lock(&lguest_lock); 324 219 /* You can't initialize twice! Close the device and start again... */ 325 220 if (file->private_data) { ··· 355 248 if (err) 356 249 goto free_eventfds; 357 250 358 - /* Initialize the Guest's shadow page tables, using the toplevel 359 - * address the Launcher gave us. This allocates memory, so can fail. */ 251 + /* 252 + * Initialize the Guest's shadow page tables, using the toplevel 253 + * address the Launcher gave us. This allocates memory, so can fail. 254 + */ 360 255 err = init_guest_pagetable(lg); 361 256 if (err) 362 257 goto free_regs; ··· 383 274 return err; 384 275 } 385 276 386 - /*L:010 The first operation the Launcher does must be a write. All writes 277 + /*L:010 278 + * The first operation the Launcher does must be a write. All writes 387 279 * start with an unsigned long number: for the first write this must be 388 280 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 389 - * writes of other values to send interrupts. 281 + * writes of other values to send interrupts or set up receipt of notifications. 390 282 * 391 283 * Note that we overload the "offset" in the /dev/lguest file to indicate what 392 - * CPU number we're dealing with. Currently this is always 0, since we only 284 + * CPU number we're dealing with. Currently this is always 0 since we only 393 285 * support uniprocessor Guests, but you can see the beginnings of SMP support 394 - * here. */ 286 + * here. 287 + */ 395 288 static ssize_t write(struct file *file, const char __user *in, 396 289 size_t size, loff_t *off) 397 290 { 398 - /* Once the Guest is initialized, we hold the "struct lguest" in the 399 - * file private data. */ 291 + /* 292 + * Once the Guest is initialized, we hold the "struct lguest" in the 293 + * file private data. 
294 + */ 400 295 struct lguest *lg = file->private_data; 401 296 const unsigned long __user *input = (const unsigned long __user *)in; 402 297 unsigned long req; ··· 435 322 } 436 323 } 437 324 438 - /*L:060 The final piece of interface code is the close() routine. It reverses 325 + /*L:060 326 + * The final piece of interface code is the close() routine. It reverses 439 327 * everything done in initialize(). This is usually called because the 440 328 * Launcher exited. 441 329 * 442 330 * Note that the close routine returns 0 or a negative error number: it can't 443 331 * really fail, but it can whine. I blame Sun for this wart, and K&R C for 444 - * letting them do it. :*/ 332 + * letting them do it. 333 + :*/ 445 334 static int close(struct inode *inode, struct file *file) 446 335 { 447 336 struct lguest *lg = file->private_data; ··· 453 338 if (!lg) 454 339 return 0; 455 340 456 - /* We need the big lock, to protect from inter-guest I/O and other 457 - * Launchers initializing guests. */ 341 + /* 342 + * We need the big lock, to protect from inter-guest I/O and other 343 + * Launchers initializing guests. 344 + */ 458 345 mutex_lock(&lguest_lock); 459 346 460 347 /* Free up the shadow page tables for the Guest. */ ··· 467 350 hrtimer_cancel(&lg->cpus[i].hrt); 468 351 /* We can free up the register page we allocated. */ 469 352 free_page(lg->cpus[i].regs_page); 470 - /* Now all the memory cleanups are done, it's safe to release 471 - * the Launcher's memory management structure. */ 353 + /* 354 + * Now all the memory cleanups are done, it's safe to release 355 + * the Launcher's memory management structure. 356 + */ 472 357 mmput(lg->cpus[i].mm); 473 358 } 474 359 ··· 479 360 eventfd_ctx_put(lg->eventfds->map[i].event); 480 361 kfree(lg->eventfds); 481 362 482 - /* If lg->dead doesn't contain an error code it will be NULL or a 483 - * kmalloc()ed string, either of which is ok to hand to kfree(). 
*/ 363 + /* 364 + * If lg->dead doesn't contain an error code it will be NULL or a 365 + * kmalloc()ed string, either of which is ok to hand to kfree(). 366 + */ 484 367 if (!IS_ERR(lg->dead)) 485 368 kfree(lg->dead); 486 369 /* Free the memory allocated to the lguest_struct */ ··· 506 385 * 507 386 * We begin our understanding with the Host kernel interface which the Launcher 508 387 * uses: reading and writing a character device called /dev/lguest. All the 509 - * work happens in the read(), write() and close() routines: */ 388 + * work happens in the read(), write() and close() routines: 389 + */ 510 390 static struct file_operations lguest_fops = { 511 391 .owner = THIS_MODULE, 512 392 .release = close, ··· 515 393 .read = read, 516 394 }; 517 395 518 - /* This is a textbook example of a "misc" character device. Populate a "struct 519 - * miscdevice" and register it with misc_register(). */ 396 + /* 397 + * This is a textbook example of a "misc" character device. Populate a "struct 398 + * miscdevice" and register it with misc_register(). 399 + */ 520 400 static struct miscdevice lguest_dev = { 521 401 .minor = MISC_DYNAMIC_MINOR, 522 402 .name = "lguest",

+336 -153

drivers/lguest/page_tables.c

··· 1 - /*P:700 The pagetable code, on the other hand, still shows the scars of 1 + /*P:700 2 + * The pagetable code, on the other hand, still shows the scars of 2 3 * previous encounters. It's functional, and as neat as it can be in the 3 4 * circumstances, but be wary, for these things are subtle and break easily. 4 5 * The Guest provides a virtual to physical mapping, but we can neither trust 5 6 * it nor use it: we verify and convert it here then point the CPU to the 6 - * converted Guest pages when running the Guest. :*/ 7 + * converted Guest pages when running the Guest. 8 + :*/ 7 9 8 10 /* Copyright (C) Rusty Russell IBM Corporation 2006. 9 11 * GPL v2 and any later version */ ··· 19 17 #include <asm/bootparam.h> 20 18 #include "lg.h" 21 19 22 - /*M:008 We hold reference to pages, which prevents them from being swapped. 20 + /*M:008 21 + * We hold reference to pages, which prevents them from being swapped. 23 22 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 24 23 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 25 - * could probably consider launching Guests as non-root. :*/ 24 + * could probably consider launching Guests as non-root. 25 + :*/ 26 26 27 27 /*H:300 28 28 * The Page Table Code 29 29 * 30 - * We use two-level page tables for the Guest. If you're not entirely 31 - * comfortable with virtual addresses, physical addresses and page tables then 32 - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 33 - * diagrams!). 30 + * We use two-level page tables for the Guest, or three-level with PAE. If 31 + * you're not entirely comfortable with virtual addresses, physical addresses 32 + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page 33 + * Table Handling" (with diagrams!). 34 34 * 35 35 * The Guest keeps page tables, but we maintain the actual ones here: these are 36 36 * called "shadow" page tables. 
Which is a very Guest-centric name: these are ··· 49 45 * (v) Flushing (throwing away) page tables, 50 46 * (vi) Mapping the Switcher when the Guest is about to run, 51 47 * (vii) Setting up the page tables initially. 52 - :*/ 48 + :*/ 53 49 54 - 55 - /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 56 - * conveniently placed at the top 4MB, so it uses a separate, complete PTE 57 - * page. */ 50 + /* 51 + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) 52 + * or 512 PTE entries with PAE (2MB). 53 + */ 58 54 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 59 55 60 - /* For PAE we need the PMD index as well. We use the last 2MB, so we 61 - * will need the last pmd entry of the last pmd page. */ 56 + /* 57 + * For PAE we need the PMD index as well. We use the last 2MB, so we 58 + * will need the last pmd entry of the last pmd page. 59 + */ 62 60 #ifdef CONFIG_X86_PAE 63 61 #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) 64 62 #define RESERVE_MEM 2U ··· 70 64 #define CHECK_GPGD_MASK _PAGE_TABLE 71 65 #endif 72 66 73 - /* We actually need a separate PTE page for each CPU. Remember that after the 67 + /* 68 + * We actually need a separate PTE page for each CPU. Remember that after the 74 69 * Switcher code itself comes two pages for each CPU, and we don't want this 75 - * CPU's guest to see the pages of any other CPU. */ 70 + * CPU's guest to see the pages of any other CPU. 71 + */ 76 72 static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 77 73 #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 78 74 79 - /*H:320 The page table code is curly enough to need helper functions to keep it 80 - * clear and clean. 75 + /*H:320 76 + * The page table code is curly enough to need helper functions to keep it 77 + * clear and clean. The kernel itself provides many of them; one advantage 78 + * of insisting that the Guest and Host use the same CONFIG_PAE setting. 
81 79 * 82 80 * There are two functions which return pointers to the shadow (aka "real") 83 81 * page tables. ··· 89 79 * spgd_addr() takes the virtual address and returns a pointer to the top-level 90 80 * page directory entry (PGD) for that address. Since we keep track of several 91 81 * page tables, the "i" argument tells us which one we're interested in (it's 92 - * usually the current one). */ 82 + * usually the current one). 83 + */ 93 84 static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 94 85 { 95 86 unsigned int index = pgd_index(vaddr); ··· 107 96 } 108 97 109 98 #ifdef CONFIG_X86_PAE 110 - /* This routine then takes the PGD entry given above, which contains the 99 + /* 100 + * This routine then takes the PGD entry given above, which contains the 111 101 * address of the PMD page. It then returns a pointer to the PMD entry for the 112 - * given address. */ 102 + * given address. 103 + */ 113 104 static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 114 105 { 115 106 unsigned int index = pmd_index(vaddr); ··· 132 119 } 133 120 #endif 134 121 135 - /* This routine then takes the page directory entry returned above, which 122 + /* 123 + * This routine then takes the page directory entry returned above, which 136 124 * contains the address of the page table entry (PTE) page. It then returns a 137 - * pointer to the PTE entry for the given address. */ 125 + * pointer to the PTE entry for the given address. 126 + */ 138 127 static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 139 128 { 140 129 #ifdef CONFIG_X86_PAE ··· 154 139 return &page[pte_index(vaddr)]; 155 140 } 156 141 157 - /* These two functions just like the above two, except they access the Guest 158 - * page tables. Hence they return a Guest address. */ 142 + /* 143 + * These functions are just like the above two, except they access the Guest 144 + * page tables. Hence they return a Guest address. 
145 + */ 159 146 static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 160 147 { 161 148 unsigned int index = vaddr >> (PGDIR_SHIFT); ··· 165 148 } 166 149 167 150 #ifdef CONFIG_X86_PAE 151 + /* Follow the PGD to the PMD. */ 168 152 static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 169 153 { 170 154 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; ··· 173 155 return gpage + pmd_index(vaddr) * sizeof(pmd_t); 174 156 } 175 157 158 + /* Follow the PMD to the PTE. */ 176 159 static unsigned long gpte_addr(struct lg_cpu *cpu, 177 160 pmd_t gpmd, unsigned long vaddr) 178 161 { ··· 183 164 return gpage + pte_index(vaddr) * sizeof(pte_t); 184 165 } 185 166 #else 167 + /* Follow the PGD to the PTE (no mid-level for !PAE). */ 186 168 static unsigned long gpte_addr(struct lg_cpu *cpu, 187 169 pgd_t gpgd, unsigned long vaddr) 188 170 { ··· 195 175 #endif 196 176 /*:*/ 197 177 198 - /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as 199 - * an optimization (ie. pre-faulting). :*/ 178 + /*M:014 179 + * get_pfn is slow: we could probably try to grab batches of pages here as 180 + * an optimization (ie. pre-faulting). 181 + :*/ 200 182 201 - /*H:350 This routine takes a page number given by the Guest and converts it to 183 + /*H:350 184 + * This routine takes a page number given by the Guest and converts it to 202 185 * an actual, physical page number. It can fail for several reasons: the 203 186 * virtual address might not be mapped by the Launcher, the write flag is set 204 187 * and the page is read-only, or the write flag was set and the page was 205 188 * shared so had to be copied, but we ran out of memory. 206 189 * 207 190 * This holds a reference to the page, so release_pte() is careful to put that 208 - * back. */ 191 + * back. 
192 + */ 209 193 static unsigned long get_pfn(unsigned long virtpfn, int write) 210 194 { 211 195 struct page *page; ··· 222 198 return -1UL; 223 199 } 224 200 225 - /*H:340 Converting a Guest page table entry to a shadow (ie. real) page table 201 + /*H:340 202 + * Converting a Guest page table entry to a shadow (ie. real) page table 226 203 * entry can be a little tricky. The flags are (almost) the same, but the 227 204 * Guest PTE contains a virtual page number: the CPU needs the real page 228 - * number. */ 205 + * number. 206 + */ 229 207 static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 230 208 { 231 209 unsigned long pfn, base, flags; 232 210 233 - /* The Guest sets the global flag, because it thinks that it is using 211 + /* 212 + * The Guest sets the global flag, because it thinks that it is using 234 213 * PGE. We only told it to use PGE so it would tell us whether it was 235 214 * flushing a kernel mapping or a userspace mapping. We don't actually 236 - * use the global bit, so throw it away. */ 215 + * use the global bit, so throw it away. 216 + */ 237 217 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 238 218 239 219 /* The Guest's pages are offset inside the Launcher. */ 240 220 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 241 221 242 - /* We need a temporary "unsigned long" variable to hold the answer from 222 + /* 223 + * We need a temporary "unsigned long" variable to hold the answer from 243 224 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 244 225 * fit in spte.pfn. get_pfn() finds the real physical number of the 245 - * page, given the virtual number. */ 226 + * page, given the virtual number. 
227 + */ 246 228 pfn = get_pfn(base + pte_pfn(gpte), write); 247 229 if (pfn == -1UL) { 248 230 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 249 - /* When we destroy the Guest, we'll go through the shadow page 231 + /* 232 + * When we destroy the Guest, we'll go through the shadow page 250 233 * tables and release_pte() them. Make sure we don't think 251 - * this one is valid! */ 234 + * this one is valid! 235 + */ 252 236 flags = 0; 253 237 } 254 238 /* Now we assemble our shadow PTE from the page number and flags. */ ··· 266 234 /*H:460 And to complete the chain, release_pte() looks like this: */ 267 235 static void release_pte(pte_t pte) 268 236 { 269 - /* Remember that get_user_pages_fast() took a reference to the page, in 270 - * get_pfn()? We have to put it back now. */ 237 + /* 238 + * Remember that get_user_pages_fast() took a reference to the page, in 239 + * get_pfn()? We have to put it back now. 240 + */ 271 241 if (pte_flags(pte) & _PAGE_PRESENT) 272 242 put_page(pte_page(pte)); 273 243 } ··· 307 273 * and return to the Guest without it knowing. 308 274 * 309 275 * If we fixed up the fault (ie. we mapped the address), this routine returns 310 - * true. Otherwise, it was a real fault and we need to tell the Guest. */ 276 + * true. Otherwise, it was a real fault and we need to tell the Guest. 277 + */ 311 278 bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 312 279 { 313 280 pgd_t gpgd; ··· 317 282 pte_t gpte; 318 283 pte_t *spte; 319 284 285 + /* Mid level for PAE. */ 320 286 #ifdef CONFIG_X86_PAE 321 287 pmd_t *spmd; 322 288 pmd_t gpmd; ··· 334 298 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 335 299 /* No shadow entry: allocate a new shadow PTE page. */ 336 300 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 337 - /* This is not really the Guest's fault, but killing it is 338 - * simple for this corner case. 
*/ 301 + /* 302 + * This is not really the Guest's fault, but killing it is 303 + * simple for this corner case. 304 + */ 339 305 if (!ptepage) { 340 306 kill_guest(cpu, "out of memory allocating pte page"); 341 307 return false; 342 308 } 343 309 /* We check that the Guest pgd is OK. */ 344 310 check_gpgd(cpu, gpgd); 345 - /* And we copy the flags to the shadow PGD entry. The page 346 - * number in the shadow PGD is the page we just allocated. */ 311 + /* 312 + * And we copy the flags to the shadow PGD entry. The page 313 + * number in the shadow PGD is the page we just allocated. 314 + */ 347 315 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 348 316 } 349 317 350 318 #ifdef CONFIG_X86_PAE 351 319 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 352 - /* middle level not present? We can't map it in. */ 320 + /* Middle level not present? We can't map it in. */ 353 321 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 354 322 return false; 355 323 ··· 364 324 /* No shadow entry: allocate a new shadow PTE page. */ 365 325 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 366 326 367 - /* This is not really the Guest's fault, but killing it is 368 - * simple for this corner case. */ 327 + /* 328 + * This is not really the Guest's fault, but killing it is 329 + * simple for this corner case. 330 + */ 369 331 if (!ptepage) { 370 332 kill_guest(cpu, "out of memory allocating pte page"); 371 333 return false; ··· 376 334 /* We check that the Guest pmd is OK. */ 377 335 check_gpmd(cpu, gpmd); 378 336 379 - /* And we copy the flags to the shadow PMD entry. The page 380 - * number in the shadow PMD is the page we just allocated. */ 337 + /* 338 + * And we copy the flags to the shadow PMD entry. The page 339 + * number in the shadow PMD is the page we just allocated. 
340 + */ 381 341 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 382 342 } 383 343 384 - /* OK, now we look at the lower level in the Guest page table: keep its 385 - * address, because we might update it later. */ 344 + /* 345 + * OK, now we look at the lower level in the Guest page table: keep its 346 + * address, because we might update it later. 347 + */ 386 348 gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 387 349 #else 388 - /* OK, now we look at the lower level in the Guest page table: keep its 389 - * address, because we might update it later. */ 350 + /* 351 + * OK, now we look at the lower level in the Guest page table: keep its 352 + * address, because we might update it later. 353 + */ 390 354 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 391 355 #endif 356 + 357 + /* Read the actual PTE value. */ 392 358 gpte = lgread(cpu, gpte_ptr, pte_t); 393 359 394 360 /* If this page isn't in the Guest page tables, we can't page it in. */ 395 361 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 396 362 return false; 397 363 398 - /* Check they're not trying to write to a page the Guest wants 399 - * read-only (bit 2 of errcode == write). */ 364 + /* 365 + * Check they're not trying to write to a page the Guest wants 366 + * read-only (bit 2 of errcode == write). 367 + */ 400 368 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 401 369 return false; 402 370 ··· 414 362 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 415 363 return false; 416 364 417 - /* Check that the Guest PTE flags are OK, and the page number is below 418 - * the pfn_limit (ie. not mapping the Launcher binary). */ 365 + /* 366 + * Check that the Guest PTE flags are OK, and the page number is below 367 + * the pfn_limit (ie. not mapping the Launcher binary). 368 + */ 419 369 check_gpte(cpu, gpte); 420 370 421 371 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ ··· 427 373 428 374 /* Get the pointer to the shadow PTE entry we're going to set. 
*/ 429 375 spte = spte_addr(cpu, *spgd, vaddr); 430 - /* If there was a valid shadow PTE entry here before, we release it. 431 - * This can happen with a write to a previously read-only entry. */ 376 + 377 + /* 378 + * If there was a valid shadow PTE entry here before, we release it. 379 + * This can happen with a write to a previously read-only entry. 380 + */ 432 381 release_pte(*spte); 433 382 434 - /* If this is a write, we insist that the Guest page is writable (the 435 - * final arg to gpte_to_spte()). */ 383 + /* 384 + * If this is a write, we insist that the Guest page is writable (the 385 + * final arg to gpte_to_spte()). 386 + */ 436 387 if (pte_dirty(gpte)) 437 388 *spte = gpte_to_spte(cpu, gpte, 1); 438 389 else 439 - /* If this is a read, don't set the "writable" bit in the page 390 + /* 391 + * If this is a read, don't set the "writable" bit in the page 440 392 * table entry, even if the Guest says it's writable. That way 441 393 * we will come back here when a write does actually occur, so 442 - * we can update the Guest's _PAGE_DIRTY flag. */ 394 + * we can update the Guest's _PAGE_DIRTY flag. 395 + */ 443 396 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 444 397 445 - /* Finally, we write the Guest PTE entry back: we've set the 446 - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 398 + /* 399 + * Finally, we write the Guest PTE entry back: we've set the 400 + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. 401 + */ 447 402 lgwrite(cpu, gpte_ptr, pte_t, gpte); 448 403 449 - /* The fault is fixed, the page table is populated, the mapping 404 + /* 405 + * The fault is fixed, the page table is populated, the mapping 450 406 * manipulated, the result returned and the code complete. A small 451 407 * delay and a trace of alliteration are the only indications the Guest 452 - * has that a page fault occurred at all. */ 408 + * has that a page fault occurred at all. 
409 + */ 453 410 return true; 454 411 } 455 412 ··· 473 408 * mapped, so it's overkill. 474 409 * 475 410 * This is a quick version which answers the question: is this virtual address 476 - * mapped by the shadow page tables, and is it writable? */ 411 + * mapped by the shadow page tables, and is it writable? 412 + */ 477 413 static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 478 414 { 479 415 pgd_t *spgd; ··· 494 428 return false; 495 429 #endif 496 430 497 - /* Check the flags on the pte entry itself: it must be present and 498 - * writable. */ 431 + /* 432 + * Check the flags on the pte entry itself: it must be present and 433 + * writable. 434 + */ 499 435 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 500 436 501 437 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 502 438 } 503 439 504 - /* So, when pin_stack_pages() asks us to pin a page, we check if it's already 440 + /* 441 + * So, when pin_stack_pages() asks us to pin a page, we check if it's already 505 442 * in the page tables, and if not, we call demand_page() with error code 2 506 - * (meaning "write"). */ 443 + * (meaning "write"). 444 + */ 507 445 void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 508 446 { 509 447 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 510 448 kill_guest(cpu, "bad stack page %#lx", vaddr); 511 449 } 450 + /*:*/ 512 451 513 452 #ifdef CONFIG_X86_PAE 514 453 static void release_pmd(pmd_t *spmd) ··· 550 479 } 551 480 552 481 #else /* !CONFIG_X86_PAE */ 553 - /*H:450 If we chase down the release_pgd() code, it looks like this: */ 482 + /*H:450 483 + * If we chase down the release_pgd() code, the non-PAE version looks like 484 + * this. The PAE version is almost identical, but instead of calling 485 + * release_pte it calls release_pmd(), which looks much like this. 486 + */ 554 487 static void release_pgd(pgd_t *spgd) 555 488 { 556 489 /* If the entry's not present, there's nothing to release. 
*/ 557 490 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 558 491 unsigned int i; 559 - /* Converting the pfn to find the actual PTE page is easy: turn 492 + /* 493 + * Converting the pfn to find the actual PTE page is easy: turn 560 494 * the page number into a physical address, then convert to a 561 - * virtual address (easy for kernel pages like this one). */ 495 + * virtual address (easy for kernel pages like this one). 496 + */ 562 497 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 563 498 /* For each entry in the page, we might need to release it. */ 564 499 for (i = 0; i < PTRS_PER_PTE; i++) ··· 576 499 } 577 500 } 578 501 #endif 579 - /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 502 + 503 + /*H:445 504 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() 580 505 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 581 - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 506 + * It simply releases every PTE page from 0 up to the Guest's kernel address. 507 + */ 582 508 static void flush_user_mappings(struct lguest *lg, int idx) 583 509 { 584 510 unsigned int i; ··· 590 510 release_pgd(lg->pgdirs[idx].pgdir + i); 591 511 } 592 512 593 - /*H:440 (v) Flushing (throwing away) page tables, 513 + /*H:440 514 + * (v) Flushing (throwing away) page tables, 594 515 * 595 516 * The Guest has a hypercall to throw away the page tables: it's used when a 596 - * large number of mappings have been changed. */ 517 + * large number of mappings have been changed. 518 + */ 597 519 void guest_pagetable_flush_user(struct lg_cpu *cpu) 598 520 { 599 521 /* Drop the userspace part of the current page table. */ ··· 633 551 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 634 552 } 635 553 636 - /* We keep several page tables. This is a simple routine to find the page 554 + /* 555 + * We keep several page tables. 
This is a simple routine to find the page 637 556 * table (if any) corresponding to this top-level address the Guest has given 638 - * us. */ 557 + * us. 558 + */ 639 559 static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 640 560 { 641 561 unsigned int i; ··· 647 563 return i; 648 564 } 649 565 650 - /*H:435 And this is us, creating the new page directory. If we really do 566 + /*H:435 567 + * And this is us, creating the new page directory. If we really do 651 568 * allocate a new one (and so the kernel parts are not there), we set 652 - * blank_pgdir. */ 569 + * blank_pgdir. 570 + */ 653 571 static unsigned int new_pgdir(struct lg_cpu *cpu, 654 572 unsigned long gpgdir, 655 573 int *blank_pgdir) ··· 661 575 pmd_t *pmd_table; 662 576 #endif 663 577 664 - /* We pick one entry at random to throw out. Choosing the Least 665 - * Recently Used might be better, but this is easy. */ 578 + /* 579 + * We pick one entry at random to throw out. Choosing the Least 580 + * Recently Used might be better, but this is easy. 581 + */ 666 582 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); 667 583 /* If it's never been allocated at all before, try now. */ 668 584 if (!cpu->lg->pgdirs[next].pgdir) { ··· 675 587 next = cpu->cpu_pgd; 676 588 else { 677 589 #ifdef CONFIG_X86_PAE 678 - /* In PAE mode, allocate a pmd page and populate the 679 - * last pgd entry. */ 590 + /* 591 + * In PAE mode, allocate a pmd page and populate the 592 + * last pgd entry. 593 + */ 680 594 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); 681 595 if (!pmd_table) { 682 596 free_page((long)cpu->lg->pgdirs[next].pgdir); ··· 688 598 set_pgd(cpu->lg->pgdirs[next].pgdir + 689 599 SWITCHER_PGD_INDEX, 690 600 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 691 - /* This is a blank page, so there are no kernel 692 - * mappings: caller must map the stack! */ 601 + /* 602 + * This is a blank page, so there are no kernel 603 + * mappings: caller must map the stack! 
604 + */ 693 605 *blank_pgdir = 1; 694 606 } 695 607 #else ··· 707 615 return next; 708 616 } 709 617 710 - /*H:430 (iv) Switching page tables 618 + /*H:430 619 + * (iv) Switching page tables 711 620 * 712 621 * Now we've seen all the page table setting and manipulation, let's see 713 622 * what happens when the Guest changes page tables (ie. changes the top-level 714 - * pgdir). This occurs on almost every context switch. */ 623 + * pgdir). This occurs on almost every context switch. 624 + */ 715 625 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 716 626 { 717 627 int newpgdir, repin = 0; 718 628 719 629 /* Look to see if we have this one already. */ 720 630 newpgdir = find_pgdir(cpu->lg, pgtable); 721 - /* If not, we allocate or mug an existing one: if it's a fresh one, 722 - * repin gets set to 1. */ 631 + /* 632 + * If not, we allocate or mug an existing one: if it's a fresh one, 633 + * repin gets set to 1. 634 + */ 723 635 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 724 636 newpgdir = new_pgdir(cpu, pgtable, &repin); 725 637 /* Change the current pgd index to the new one. */ ··· 733 637 pin_stack_pages(cpu); 734 638 } 735 639 736 - /*H:470 Finally, a routine which throws away everything: all PGD entries in all 640 + /*H:470 641 + * Finally, a routine which throws away everything: all PGD entries in all 737 642 * the shadow page tables, including the Guest's kernel mappings. This is used 738 - * when we destroy the Guest. */ 643 + * when we destroy the Guest. 644 + */ 739 645 static void release_all_pagetables(struct lguest *lg) 740 646 { 741 647 unsigned int i, j; ··· 754 656 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 755 657 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 756 658 757 - /* And release the pmd entries of that pmd page, 758 - * except for the switcher pmd. */ 659 + /* 660 + * And release the pmd entries of that pmd page, 661 + * except for the switcher pmd. 
662 + */ 759 663 for (k = 0; k < SWITCHER_PMD_INDEX; k++) 760 664 release_pmd(&pmdpage[k]); 761 665 #endif ··· 767 667 } 768 668 } 769 669 770 - /* We also throw away everything when a Guest tells us it's changed a kernel 670 + /* 671 + * We also throw away everything when a Guest tells us it's changed a kernel 771 672 * mapping. Since kernel mappings are in every page table, it's easiest to 772 673 * throw them all away. This traps the Guest in amber for a while as 773 - * everything faults back in, but it's rare. */ 674 + * everything faults back in, but it's rare. 675 + */ 774 676 void guest_pagetable_clear_all(struct lg_cpu *cpu) 775 677 { 776 678 release_all_pagetables(cpu->lg); ··· 780 678 pin_stack_pages(cpu); 781 679 } 782 680 /*:*/ 783 - /*M:009 Since we throw away all mappings when a kernel mapping changes, our 681 + 682 + /*M:009 683 + * Since we throw away all mappings when a kernel mapping changes, our 784 684 * performance sucks for guests using highmem. In fact, a guest with 785 685 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 786 686 * usually slower than a Guest with less memory. 787 687 * 788 688 * This, of course, cannot be fixed. It would take some kind of... well, I 789 - * don't know, but the term "puissant code-fu" comes to mind. :*/ 689 + * don't know, but the term "puissant code-fu" comes to mind. 690 + :*/ 790 691 791 - /*H:420 This is the routine which actually sets the page table entry for then 692 + /*H:420 693 + * This is the routine which actually sets the page table entry for then 792 694 * "idx"'th shadow page table. 793 695 * 794 696 * Normally, we can just throw out the old entry and replace it with 0: if they ··· 821 715 spmd = spmd_addr(cpu, *spgd, vaddr); 822 716 if (pmd_flags(*spmd) & _PAGE_PRESENT) { 823 717 #endif 824 - /* Otherwise, we start by releasing 825 - * the existing entry. */ 718 + /* Otherwise, start by releasing the existing entry. 
*/ 826 719 pte_t *spte = spte_addr(cpu, *spgd, vaddr); 827 720 release_pte(*spte); 828 721 829 - /* If they're setting this entry as dirty or accessed, 830 - * we might as well put that entry they've given us 831 - * in now. This shaves 10% off a 832 - * copy-on-write micro-benchmark. */ 722 + /* 723 + * If they're setting this entry as dirty or accessed, 724 + * we might as well put that entry they've given us in 725 + * now. This shaves 10% off a copy-on-write 726 + * micro-benchmark. 727 + */ 833 728 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 834 729 check_gpte(cpu, gpte); 835 730 native_set_pte(spte, 836 731 gpte_to_spte(cpu, gpte, 837 732 pte_flags(gpte) & _PAGE_DIRTY)); 838 - } else 839 - /* Otherwise kill it and we can demand_page() 840 - * it in later. */ 733 + } else { 734 + /* 735 + * Otherwise kill it and we can demand_page() 736 + * it in later. 737 + */ 841 738 native_set_pte(spte, __pte(0)); 739 + } 842 740 #ifdef CONFIG_X86_PAE 843 741 } 844 742 #endif 845 743 } 846 744 } 847 745 848 - /*H:410 Updating a PTE entry is a little trickier. 746 + /*H:410 747 + * Updating a PTE entry is a little trickier. 849 748 * 850 749 * We keep track of several different page tables (the Guest uses one for each 851 750 * process, so it makes sense to cache at least a few). Each of these have ··· 859 748 * all the page tables, not just the current one. This is rare. 860 749 * 861 750 * The benefit is that when we have to track a new page table, we can keep all 862 - * the kernel mappings. This speeds up context switch immensely. */ 751 + * the kernel mappings. This speeds up context switch immensely. 752 + */ 863 753 void guest_set_pte(struct lg_cpu *cpu, 864 754 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 865 755 { 866 - /* Kernel mappings must be changed on all top levels. Slow, but doesn't 867 - * happen often. */ 756 + /* 757 + * Kernel mappings must be changed on all top levels. Slow, but doesn't 758 + * happen often. 
759 + */ 868 760 if (vaddr >= cpu->lg->kernel_address) { 869 761 unsigned int i; 870 762 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) ··· 909 795 /* ... throw it away. */ 910 796 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 911 797 } 798 + 912 799 #ifdef CONFIG_X86_PAE 800 + /* For setting a mid-level, we just throw everything away. It's easy. */ 913 801 void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 914 802 { 915 803 guest_pagetable_clear_all(&lg->cpus[0]); 916 804 } 917 805 #endif 918 806 919 - /* Once we know how much memory we have we can construct simple identity 920 - * (which set virtual == physical) and linear mappings 921 - * which will get the Guest far enough into the boot to create its own. 807 + /*H:505 808 + * To get through boot, we construct simple identity page mappings (which 809 + * set virtual == physical) and linear mappings which will get the Guest far 810 + * enough into the boot to create its own. The linear mapping means we 811 + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, 812 + * as you'll see. 922 813 * 923 814 * We lay them out of the way, just below the initrd (which is why we need to 924 - * know its size here). */ 815 + * know its size here). 816 + */ 925 817 static unsigned long setup_pagetables(struct lguest *lg, 926 818 unsigned long mem, 927 819 unsigned long initrd_size) ··· 945 825 unsigned int phys_linear; 946 826 #endif 947 827 948 - /* We have mapped_pages frames to map, so we need 949 - * linear_pages page tables to map them. */ 828 + /* 829 + * We have mapped_pages frames to map, so we need linear_pages page 830 + * tables to map them. 831 + */ 950 832 mapped_pages = mem / PAGE_SIZE; 951 833 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; 952 834 ··· 959 837 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 960 838 961 839 #ifdef CONFIG_X86_PAE 840 + /* 841 + * And the single mid page goes below that. 
We only use one, but 842 + * that's enough to map 1G, which definitely gets us through boot. 843 + */ 962 844 pmds = (void *)linear - PAGE_SIZE; 963 845 #endif 964 - /* Linear mapping is easy: put every page's address into the 965 - * mapping in order. */ 846 + /* 847 + * Linear mapping is easy: put every page's address into the 848 + * mapping in order. 849 + */ 966 850 for (i = 0; i < mapped_pages; i++) { 967 851 pte_t pte; 968 852 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); ··· 976 848 return -EFAULT; 977 849 } 978 850 979 - /* The top level points to the linear page table pages above. 980 - * We setup the identity and linear mappings here. */ 981 851 #ifdef CONFIG_X86_PAE 852 + /* 853 + * Make the Guest PMD entries point to the corresponding place in the 854 + * linear mapping (up to one page worth of PMD). 855 + */ 982 856 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 983 857 i += PTRS_PER_PTE, j++) { 858 + /* FIXME: native_set_pmd is overkill here. */ 984 859 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) 985 860 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 986 861 ··· 991 860 return -EFAULT; 992 861 } 993 862 863 + /* One PGD entry, pointing to that PMD page. */ 994 864 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); 865 + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ 995 866 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) 996 867 return -EFAULT; 868 + /* 869 + * And the third PGD entry (ie. addresses 3G-4G). 870 + * 871 + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. 872 + */ 997 873 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) 998 874 return -EFAULT; 999 875 #else 876 + /* 877 + * The top level points to the linear page table pages above. 878 + * We setup the identity and linear mappings here. 
879 + */ 1000 880 phys_linear = (unsigned long)linear - mem_base; 1001 881 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 1002 882 pgd_t pgd; 883 + /* 884 + * Create a PGD entry which points to the right part of the 885 + * linear PTE pages. 886 + */ 1003 887 pgd = __pgd((phys_linear + i * sizeof(pte_t)) | 1004 888 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 1005 889 890 + /* 891 + * Copy it into the PGD page at 0 and PAGE_OFFSET. 892 + */ 1006 893 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) 1007 894 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) 1008 895 + i / PTRS_PER_PTE], ··· 1029 880 } 1030 881 #endif 1031 882 1032 - /* We return the top level (guest-physical) address: remember where 1033 - * this is. */ 883 + /* 884 + * We return the top level (guest-physical) address: we remember where 885 + * this is to write it into lguest_data when the Guest initializes. 886 + */ 1034 887 return (unsigned long)pgdir - mem_base; 1035 888 } 1036 889 1037 - /*H:500 (vii) Setting up the page tables initially. 890 + /*H:500 891 + * (vii) Setting up the page tables initially. 1038 892 * 1039 893 * When a Guest is first created, the Launcher tells us where the toplevel of 1040 - * its first page table is. We set some things up here: */ 894 + * its first page table is. We set some things up here: 895 + */ 1041 896 int init_guest_pagetable(struct lguest *lg) 1042 897 { 1043 898 u64 mem; ··· 1051 898 pgd_t *pgd; 1052 899 pmd_t *pmd_table; 1053 900 #endif 1054 - /* Get the Guest memory size and the ramdisk size from the boot header 1055 - * located at lg->mem_base (Guest address 0). */ 901 + /* 902 + * Get the Guest memory size and the ramdisk size from the boot header 903 + * located at lg->mem_base (Guest address 0). 
904 + */ 1056 905 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) 1057 906 || get_user(initrd_size, &boot->hdr.ramdisk_size)) 1058 907 return -EFAULT; 1059 908 1060 - /* We start on the first shadow page table, and give it a blank PGD 1061 - * page. */ 909 + /* 910 + * We start on the first shadow page table, and give it a blank PGD 911 + * page. 912 + */ 1062 913 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); 1063 914 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) 1064 915 return lg->pgdirs[0].gpgdir; 1065 916 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 1066 917 if (!lg->pgdirs[0].pgdir) 1067 918 return -ENOMEM; 919 + 1068 920 #ifdef CONFIG_X86_PAE 921 + /* For PAE, we also create the initial mid-level. */ 1069 922 pgd = lg->pgdirs[0].pgdir; 1070 923 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); 1071 924 if (!pmd_table) ··· 1080 921 set_pgd(pgd + SWITCHER_PGD_INDEX, 1081 922 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 1082 923 #endif 924 + 925 + /* This is the current page table. */ 1083 926 lg->cpus[0].cpu_pgd = 0; 1084 927 return 0; 1085 928 } 1086 929 1087 - /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 930 + /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1088 931 void page_table_guest_data_init(struct lg_cpu *cpu) 1089 932 { 1090 933 /* We get the kernel address: above this is all kernel memory. */ 1091 934 if (get_user(cpu->lg->kernel_address, 1092 935 &cpu->lg->lguest_data->kernel_address) 1093 - /* We tell the Guest that it can't use the top 2 or 4 MB 1094 - * of virtual addresses used by the Switcher. */ 936 + /* 937 + * We tell the Guest that it can't use the top 2 or 4 MB 938 + * of virtual addresses used by the Switcher. 
939 + */ 1095 940 || put_user(RESERVE_MEM * 1024 * 1024, 1096 941 &cpu->lg->lguest_data->reserve_mem) 1097 942 || put_user(cpu->lg->pgdirs[0].gpgdir, 1098 943 &cpu->lg->lguest_data->pgdir)) 1099 944 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1100 945 1101 - /* In flush_user_mappings() we loop from 0 to 946 + /* 947 + * In flush_user_mappings() we loop from 0 to 1102 948 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1103 - * Switcher mappings, so check that now. */ 949 + * Switcher mappings, so check that now. 950 + */ 1104 951 #ifdef CONFIG_X86_PAE 1105 952 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && 1106 953 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) ··· 1129 964 free_page((long)lg->pgdirs[i].pgdir); 1130 965 } 1131 966 1132 - /*H:480 (vi) Mapping the Switcher when the Guest is about to run. 967 + /*H:480 968 + * (vi) Mapping the Switcher when the Guest is about to run. 1133 969 * 1134 970 * The Switcher and the two pages for this CPU need to be visible in the 1135 971 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 1136 972 * for each CPU already set up, we just need to hook them in now we know which 1137 - * Guest is about to run on this CPU. */ 973 + * Guest is about to run on this CPU. 974 + */ 1138 975 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1139 976 { 1140 977 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); ··· 1147 980 pmd_t switcher_pmd; 1148 981 pmd_t *pmd_table; 1149 982 983 + /* FIXME: native_set_pmd is overkill here. */ 1150 984 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> 1151 985 PAGE_SHIFT, PAGE_KERNEL_EXEC)); 1152 986 987 + /* Figure out where the pmd page is, by reading the PGD, and converting 988 + * it to a virtual address. 
*/ 1153 989 pmd_table = __va(pgd_pfn(cpu->lg-> 1154 990 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) 1155 991 << PAGE_SHIFT); 992 + /* Now write it into the shadow page table. */ 1156 993 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); 1157 994 #else 1158 995 pgd_t switcher_pgd; 1159 996 1160 - /* Make the last PGD entry for this Guest point to the Switcher's PTE 1161 - * page for this CPU (with appropriate flags). */ 997 + /* 998 + * Make the last PGD entry for this Guest point to the Switcher's PTE 999 + * page for this CPU (with appropriate flags). 1000 + */ 1162 1001 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); 1163 1002 1164 1003 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 1165 1004 1166 1005 #endif 1167 - /* We also change the Switcher PTE page. When we're running the Guest, 1006 + /* 1007 + * We also change the Switcher PTE page. When we're running the Guest, 1168 1008 * we want the Guest's "regs" page to appear where the first Switcher 1169 1009 * page for this CPU is. This is an optimization: when the Switcher 1170 1010 * saves the Guest registers, it saves them into the first page of this 1171 1011 * CPU's "struct lguest_pages": if we make sure the Guest's register 1172 1012 * page is already mapped there, we don't have to copy them out 1173 - * again. */ 1013 + * again. 1014 + */ 1174 1015 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; 1175 1016 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); 1176 1017 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], ··· 1194 1019 free_page((long)switcher_pte_page(i)); 1195 1020 } 1196 1021 1197 - /*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given 1022 + /*H:520 1023 + * Setting up the Switcher PTE page for given CPU is fairly easy, given 1198 1024 * the CPU number and the "struct page"s for the Switcher code itself. 1199 1025 * 1200 - * Currently the Switcher is less than a page long, so "pages" is always 1. 
*/ 1026 + * Currently the Switcher is less than a page long, so "pages" is always 1. 1027 + */ 1201 1028 static __init void populate_switcher_pte_page(unsigned int cpu, 1202 1029 struct page *switcher_page[], 1203 1030 unsigned int pages) ··· 1220 1043 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1221 1044 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1222 1045 1223 - /* The second page contains the "struct lguest_ro_state", and is 1224 - * read-only. */ 1046 + /* 1047 + * The second page contains the "struct lguest_ro_state", and is 1048 + * read-only. 1049 + */ 1225 1050 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1226 1051 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1227 1052 } 1228 1053 1229 - /* We've made it through the page table code. Perhaps our tired brains are 1054 + /* 1055 + * We've made it through the page table code. Perhaps our tired brains are 1230 1056 * still processing the details, or perhaps we're simply glad it's over. 1231 1057 * 1232 1058 * If nothing else, note that all this complexity in juggling shadow page tables ··· 1238 1058 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1239 1059 * have implemented shadow page table support directly into hardware. 1240 1060 * 1241 - * There is just one file remaining in the Host. */ 1061 + * There is just one file remaining in the Host. 1062 + */ 1242 1063 1243 - /*H:510 At boot or module load time, init_pagetables() allocates and populates 1244 - * the Switcher PTE page for each CPU. */ 1064 + /*H:510 1065 + * At boot or module load time, init_pagetables() allocates and populates 1066 + * the Switcher PTE page for each CPU. 1067 + */ 1245 1068 __init int init_pagetables(struct page **switcher_page, unsigned int pages) 1246 1069 { 1247 1070 unsigned int i;

+69 -37

drivers/lguest/segments.c

··· 1 - /*P:600 The x86 architecture has segments, which involve a table of descriptors 1 + /*P:600 2 + * The x86 architecture has segments, which involve a table of descriptors 2 3 * which can be used to do funky things with virtual address interpretation. 3 4 * We originally used to use segments so the Guest couldn't alter the 4 5 * Guest<->Host Switcher, and then we had to trim Guest segments, and restore ··· 9 8 * 10 9 * In these modern times, the segment handling code consists of simple sanity 11 10 * checks, and the worst you'll experience reading this code is butterfly-rash 12 - * from frolicking through its parklike serenity. :*/ 11 + * from frolicking through its parklike serenity. 12 + :*/ 13 13 #include "lg.h" 14 14 15 15 /*H:600 ··· 43 41 * begin. 44 42 */ 45 43 46 - /* There are several entries we don't let the Guest set. The TSS entry is the 44 + /* 45 + * There are several entries we don't let the Guest set. The TSS entry is the 47 46 * "Task State Segment" which controls all kinds of delicate things. The 48 47 * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the 49 - * the Guest can't be trusted to deal with double faults. */ 48 + * the Guest can't be trusted to deal with double faults. 49 + */ 50 50 static bool ignored_gdt(unsigned int num) 51 51 { 52 52 return (num == GDT_ENTRY_TSS ··· 57 53 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 58 54 } 59 55 60 - /*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We 56 + /*H:630 57 + * Once the Guest gave us new GDT entries, we fix them up a little. We 61 58 * don't care if they're invalid: the worst that can happen is a General 62 59 * Protection Fault in the Switcher when it restores a Guest segment register 63 60 * which tries to use that entry. Then we kill the Guest for causing such a 64 - * mess: the message will be "unhandled trap 256". */ 61 + * mess: the message will be "unhandled trap 256". 
62 + */ 65 63 static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) 66 64 { 67 65 unsigned int i; 68 66 69 67 for (i = start; i < end; i++) { 70 - /* We never copy these ones to real GDT, so we don't care what 71 - * they say */ 68 + /* 69 + * We never copy these ones to real GDT, so we don't care what 70 + * they say 71 + */ 72 72 if (ignored_gdt(i)) 73 73 continue; 74 74 75 - /* Segment descriptors contain a privilege level: the Guest is 75 + /* 76 + * Segment descriptors contain a privilege level: the Guest is 76 77 * sometimes careless and leaves this as 0, even though it's 77 - * running at privilege level 1. If so, we fix it here. */ 78 + * running at privilege level 1. If so, we fix it here. 79 + */ 78 80 if ((cpu->arch.gdt[i].b & 0x00006000) == 0) 79 81 cpu->arch.gdt[i].b |= (GUEST_PL << 13); 80 82 81 - /* Each descriptor has an "accessed" bit. If we don't set it 83 + /* 84 + * Each descriptor has an "accessed" bit. If we don't set it 82 85 * now, the CPU will try to set it when the Guest first loads 83 86 * that entry into a segment register. But the GDT isn't 84 - * writable by the Guest, so bad things can happen. */ 87 + * writable by the Guest, so bad things can happen. 88 + */ 85 89 cpu->arch.gdt[i].b |= 0x00000100; 86 90 } 87 91 } 88 92 89 - /*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep 93 + /*H:610 94 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep 90 95 * a GDT for each CPU, and copy across the Guest's entries each time we want to 91 96 * run the Guest on that CPU. 92 97 * 93 98 * This routine is called at boot or modprobe time for each CPU to set up the 94 99 * constant GDT entries: the ones which are the same no matter what Guest we're 95 - * running. */ 100 + * running. 
101 + */ 96 102 void setup_default_gdt_entries(struct lguest_ro_state *state) 97 103 { 98 104 struct desc_struct *gdt = state->guest_gdt; ··· 112 98 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 113 99 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 114 100 115 - /* The TSS segment refers to the TSS entry for this particular CPU. 101 + /* 102 + * The TSS segment refers to the TSS entry for this particular CPU. 116 103 * Forgive the magic flags: the 0x8900 means the entry is Present, it's 117 104 * privilege level 0 Available 386 TSS system segment, and the 0x67 118 - * means Saturn is eclipsed by Mercury in the twelfth house. */ 105 + * means Saturn is eclipsed by Mercury in the twelfth house. 106 + */ 119 107 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 120 108 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 121 109 | ((tss >> 16) & 0x000000FF); 122 110 } 123 111 124 - /* This routine sets up the initial Guest GDT for booting. All entries start 125 - * as 0 (unusable). */ 112 + /* 113 + * This routine sets up the initial Guest GDT for booting. All entries start 114 + * as 0 (unusable). 115 + */ 126 116 void setup_guest_gdt(struct lg_cpu *cpu) 127 117 { 128 - /* Start with full 0-4G segments... */ 118 + /* 119 + * Start with full 0-4G segments...except the Guest is allowed to use 120 + * them, so set the privilege level appropriately in the flags. 121 + */ 129 122 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 130 123 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 131 - /* ...except the Guest is allowed to use them, so set the privilege 132 - * level appropriately in the flags. */ 133 124 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 134 125 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 135 126 } 136 127 137 - /*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" 138 - * entries. */ 128 + /*H:650 129 + * An optimization of copy_gdt(), for just the three "thead-local storage" 130 + * entries. 
131 + */ 139 132 void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) 140 133 { 141 134 unsigned int i; ··· 151 130 gdt[i] = cpu->arch.gdt[i]; 152 131 } 153 132 154 - /*H:640 When the Guest is run on a different CPU, or the GDT entries have 155 - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this 156 - * CPU's GDT. */ 133 + /*H:640 134 + * When the Guest is run on a different CPU, or the GDT entries have changed, 135 + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's 136 + * GDT. 137 + */ 157 138 void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) 158 139 { 159 140 unsigned int i; 160 141 161 - /* The default entries from setup_default_gdt_entries() are not 162 - * replaced. See ignored_gdt() above. */ 142 + /* 143 + * The default entries from setup_default_gdt_entries() are not 144 + * replaced. See ignored_gdt() above. 145 + */ 163 146 for (i = 0; i < GDT_ENTRIES; i++) 164 147 if (!ignored_gdt(i)) 165 148 gdt[i] = cpu->arch.gdt[i]; 166 149 } 167 150 168 - /*H:620 This is where the Guest asks us to load a new GDT entry 169 - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ 151 + /*H:620 152 + * This is where the Guest asks us to load a new GDT entry 153 + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. 154 + */ 170 155 void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) 171 156 { 172 - /* We assume the Guest has the same number of GDT entries as the 173 - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 157 + /* 158 + * We assume the Guest has the same number of GDT entries as the 159 + * Host, otherwise we'd have to dynamically allocate the Guest GDT. 
160 + */ 174 161 if (num >= ARRAY_SIZE(cpu->arch.gdt)) 175 162 kill_guest(cpu, "too many gdt entries %i", num); 176 163 ··· 186 157 cpu->arch.gdt[num].a = lo; 187 158 cpu->arch.gdt[num].b = hi; 188 159 fixup_gdt_table(cpu, num, num+1); 189 - /* Mark that the GDT changed so the core knows it has to copy it again, 190 - * even if the Guest is run on the same CPU. */ 160 + /* 161 + * Mark that the GDT changed so the core knows it has to copy it again, 162 + * even if the Guest is run on the same CPU. 163 + */ 191 164 cpu->changed |= CHANGED_GDT; 192 165 } 193 166 194 - /* This is the fast-track version for just changing the three TLS entries. 167 + /* 168 + * This is the fast-track version for just changing the three TLS entries. 195 169 * Remember that this happens on every context switch, so it's worth 196 170 * optimizing. But wouldn't it be neater to have a single hypercall to cover 197 - * both cases? */ 171 + * both cases? 172 + */ 198 173 void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) 199 174 { 200 175 struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; ··· 208 175 /* Note that just the TLS entries have changed. */ 209 176 cpu->changed |= CHANGED_GDT_TLS; 210 177 } 211 - /*:*/ 212 178 213 179 /*H:660 214 180 * With this, we have finished the Host.

+246 -128

drivers/lguest/x86/core.c

··· 17 17 * along with this program; if not, write to the Free Software 18 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 19 */ 20 - /*P:450 This file contains the x86-specific lguest code. It used to be all 20 + /*P:450 21 + * This file contains the x86-specific lguest code. It used to be all 21 22 * mixed in with drivers/lguest/core.c but several foolhardy code slashers 22 23 * wrestled most of the dependencies out to here in preparation for porting 23 24 * lguest to other architectures (see what I mean by foolhardy?). 24 25 * 25 26 * This also contains a couple of non-obvious setup and teardown pieces which 26 - * were implemented after days of debugging pain. :*/ 27 + * were implemented after days of debugging pain. 28 + :*/ 27 29 #include <linux/kernel.h> 28 30 #include <linux/start_kernel.h> 29 31 #include <linux/string.h> ··· 84 82 */ 85 83 static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) 86 84 { 87 - /* Copying all this data can be quite expensive. We usually run the 85 + /* 86 + * Copying all this data can be quite expensive. We usually run the 88 87 * same Guest we ran last time (and that Guest hasn't run anywhere else 89 88 * meanwhile). If that's not the case, we pretend everything in the 90 - * Guest has changed. */ 89 + * Guest has changed. 90 + */ 91 91 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { 92 92 __get_cpu_var(last_cpu) = cpu; 93 93 cpu->last_pages = pages; 94 94 cpu->changed = CHANGED_ALL; 95 95 } 96 96 97 - /* These copies are pretty cheap, so we do them unconditionally: */ 98 - /* Save the current Host top-level page directory. */ 97 + /* 98 + * These copies are pretty cheap, so we do them unconditionally: */ 99 + /* Save the current Host top-level page directory. 100 + */ 99 101 pages->state.host_cr3 = __pa(current->mm->pgd); 100 - /* Set up the Guest's page tables to see this CPU's pages (and no 101 - * other CPU's pages). 
*/ 102 + /* 103 + * Set up the Guest's page tables to see this CPU's pages (and no 104 + * other CPU's pages). 105 + */ 102 106 map_switcher_in_guest(cpu, pages); 103 - /* Set up the two "TSS" members which tell the CPU what stack to use 107 + /* 108 + * Set up the two "TSS" members which tell the CPU what stack to use 104 109 * for traps which do directly into the Guest (ie. traps at privilege 105 - * level 1). */ 110 + * level 1). 111 + */ 106 112 pages->state.guest_tss.sp1 = cpu->esp1; 107 113 pages->state.guest_tss.ss1 = cpu->ss1; 108 114 ··· 135 125 /* This is a dummy value we need for GCC's sake. */ 136 126 unsigned int clobber; 137 127 138 - /* Copy the guest-specific information into this CPU's "struct 139 - * lguest_pages". */ 128 + /* 129 + * Copy the guest-specific information into this CPU's "struct 130 + * lguest_pages". 131 + */ 140 132 copy_in_guest_info(cpu, pages); 141 133 142 - /* Set the trap number to 256 (impossible value). If we fault while 134 + /* 135 + * Set the trap number to 256 (impossible value). If we fault while 143 136 * switching to the Guest (bad segment registers or bug), this will 144 - * cause us to abort the Guest. */ 137 + * cause us to abort the Guest. 138 + */ 145 139 cpu->regs->trapnum = 256; 146 140 147 - /* Now: we push the "eflags" register on the stack, then do an "lcall". 141 + /* 142 + * Now: we push the "eflags" register on the stack, then do an "lcall". 148 143 * This is how we change from using the kernel code segment to using 149 144 * the dedicated lguest code segment, as well as jumping into the 150 145 * Switcher. 151 146 * 152 147 * The lcall also pushes the old code segment (KERNEL_CS) onto the 153 148 * stack, then the address of this call. This stack layout happens to 154 - * exactly match the stack layout created by an interrupt... */ 149 + * exactly match the stack layout created by an interrupt... 
150 + */ 155 151 asm volatile("pushf; lcall *lguest_entry" 156 - /* This is how we tell GCC that %eax ("a") and %ebx ("b") 157 - * are changed by this routine. The "=" means output. */ 152 + /* 153 + * This is how we tell GCC that %eax ("a") and %ebx ("b") 154 + * are changed by this routine. The "=" means output. 155 + */ 158 156 : "=a"(clobber), "=b"(clobber) 159 - /* %eax contains the pages pointer. ("0" refers to the 157 + /* 158 + * %eax contains the pages pointer. ("0" refers to the 160 159 * 0-th argument above, ie "a"). %ebx contains the 161 160 * physical address of the Guest's top-level page 162 - * directory. */ 161 + * directory. 162 + */ 163 163 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) 164 - /* We tell gcc that all these registers could change, 164 + /* 165 + * We tell gcc that all these registers could change, 165 166 * which means we don't have to save and restore them in 166 - * the Switcher. */ 167 + * the Switcher. 168 + */ 167 169 : "memory", "%edx", "%ecx", "%edi", "%esi"); 168 170 } 169 171 /*:*/ 170 172 171 - /*M:002 There are hooks in the scheduler which we can register to tell when we 173 + /*M:002 174 + * There are hooks in the scheduler which we can register to tell when we 172 175 * get kicked off the CPU (preempt_notifier_register()). This would allow us 173 176 * to lazily disable SYSENTER which would regain some performance, and should 174 177 * also simplify copy_in_guest_info(). Note that we'd still need to restore 175 178 * things when we exit to Launcher userspace, but that's fairly easy. 176 179 * 177 - * We could also try using this hooks for PGE, but that might be too expensive. 180 + * We could also try using these hooks for PGE, but that might be too expensive. 178 181 * 179 - * The hooks were designed for KVM, but we can also put them to good use. :*/ 182 + * The hooks were designed for KVM, but we can also put them to good use. 
183 + :*/ 180 184 181 - /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 182 - * are disabled: we own the CPU. */ 185 + /*H:040 186 + * This is the i386-specific code to setup and run the Guest. Interrupts 187 + * are disabled: we own the CPU. 188 + */ 183 189 void lguest_arch_run_guest(struct lg_cpu *cpu) 184 190 { 185 - /* Remember the awfully-named TS bit? If the Guest has asked to set it 191 + /* 192 + * Remember the awfully-named TS bit? If the Guest has asked to set it 186 193 * we set it now, so we can trap and pass that trap to the Guest if it 187 - * uses the FPU. */ 194 + * uses the FPU. 195 + */ 188 196 if (cpu->ts) 189 197 unlazy_fpu(current); 190 198 191 - /* SYSENTER is an optimized way of doing system calls. We can't allow 199 + /* 200 + * SYSENTER is an optimized way of doing system calls. We can't allow 192 201 * it because it always jumps to privilege level 0. A normal Guest 193 202 * won't try it because we don't advertise it in CPUID, but a malicious 194 203 * Guest (or malicious Guest userspace program) could, so we tell the 195 - * CPU to disable it before running the Guest. */ 204 + * CPU to disable it before running the Guest. 205 + */ 196 206 if (boot_cpu_has(X86_FEATURE_SEP)) 197 207 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 198 208 199 - /* Now we actually run the Guest. It will return when something 209 + /* 210 + * Now we actually run the Guest. It will return when something 200 211 * interesting happens, and we can examine its registers to see what it 201 - * was doing. */ 212 + * was doing. 213 + */ 202 214 run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 203 215 204 - /* Note that the "regs" structure contains two extra entries which are 216 + /* 217 + * Note that the "regs" structure contains two extra entries which are 205 218 * not really registers: a trap number which says what interrupt or 206 219 * trap made the switcher code come back, and an error code which some 207 - * traps set. 
*/ 220 + * traps set. 221 + */ 208 222 209 223 /* Restore SYSENTER if it's supposed to be on. */ 210 224 if (boot_cpu_has(X86_FEATURE_SEP)) 211 225 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 212 226 213 - /* If the Guest page faulted, then the cr2 register will tell us the 227 + /* 228 + * If the Guest page faulted, then the cr2 register will tell us the 214 229 * bad virtual address. We have to grab this now, because once we 215 230 * re-enable interrupts an interrupt could fault and thus overwrite 216 - * cr2, or we could even move off to a different CPU. */ 231 + * cr2, or we could even move off to a different CPU. 232 + */ 217 233 if (cpu->regs->trapnum == 14) 218 234 cpu->arch.last_pagefault = read_cr2(); 219 - /* Similarly, if we took a trap because the Guest used the FPU, 235 + /* 236 + * Similarly, if we took a trap because the Guest used the FPU, 220 237 * we have to restore the FPU it expects to see. 221 238 * math_state_restore() may sleep and we may even move off to 222 239 * a different CPU. So all the critical stuff should be done 223 - * before this. */ 240 + * before this. 241 + */ 224 242 else if (cpu->regs->trapnum == 7) 225 243 math_state_restore(); 226 244 } 227 245 228 - /*H:130 Now we've examined the hypercall code; our Guest can make requests. 246 + /*H:130 247 + * Now we've examined the hypercall code; our Guest can make requests. 229 248 * Our Guest is usually so well behaved; it never tries to do things it isn't 230 249 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 231 250 * infrastructure isn't quite complete, because it doesn't contain replacements ··· 264 225 * 265 226 * When the Guest uses one of these instructions, we get a trap (General 266 227 * Protection Fault) and come here. We see if it's one of those troublesome 267 - * instructions and skip over it. We return true if we did. */ 228 + * instructions and skip over it. We return true if we did. 
229 + */ 268 230 static int emulate_insn(struct lg_cpu *cpu) 269 231 { 270 232 u8 insn; 271 233 unsigned int insnlen = 0, in = 0, shift = 0; 272 - /* The eip contains the *virtual* address of the Guest's instruction: 273 - * guest_pa just subtracts the Guest's page_offset. */ 234 + /* 235 + * The eip contains the *virtual* address of the Guest's instruction: 236 + * guest_pa just subtracts the Guest's page_offset. 237 + */ 274 238 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); 275 239 276 - /* This must be the Guest kernel trying to do something, not userspace! 240 + /* 241 + * This must be the Guest kernel trying to do something, not userspace! 277 242 * The bottom two bits of the CS segment register are the privilege 278 - * level. */ 243 + * level. 244 + */ 279 245 if ((cpu->regs->cs & 3) != GUEST_PL) 280 246 return 0; 281 247 282 248 /* Decoding x86 instructions is icky. */ 283 249 insn = lgread(cpu, physaddr, u8); 284 250 285 - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 286 - of the eax register. */ 251 + /* 252 + * 0x66 is an "operand prefix". It means it's using the upper 16 bits 253 + * of the eax register. 254 + */ 287 255 if (insn == 0x66) { 288 256 shift = 16; 289 257 /* The instruction is 1 byte so far, read the next byte. */ ··· 298 252 insn = lgread(cpu, physaddr + insnlen, u8); 299 253 } 300 254 301 - /* We can ignore the lower bit for the moment and decode the 4 opcodes 302 - * we need to emulate. */ 255 + /* 256 + * We can ignore the lower bit for the moment and decode the 4 opcodes 257 + * we need to emulate. 258 + */ 303 259 switch (insn & 0xFE) { 304 260 case 0xE4: /* in <next byte>,%al */ 305 261 insnlen += 2; ··· 322 274 return 0; 323 275 } 324 276 325 - /* If it was an "IN" instruction, they expect the result to be read 277 + /* 278 + * If it was an "IN" instruction, they expect the result to be read 326 279 * into %eax, so we change %eax. 
We always return all-ones, which 327 - * traditionally means "there's nothing there". */ 280 + * traditionally means "there's nothing there". 281 + */ 328 282 if (in) { 329 283 /* Lower bit tells is whether it's a 16 or 32 bit access */ 330 284 if (insn & 0x1) ··· 340 290 return 1; 341 291 } 342 292 343 - /* Our hypercalls mechanism used to be based on direct software interrupts. 293 + /* 294 + * Our hypercalls mechanism used to be based on direct software interrupts. 344 295 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to 345 296 * change over to using kvm hypercalls. 346 297 * ··· 369 318 */ 370 319 static void rewrite_hypercall(struct lg_cpu *cpu) 371 320 { 372 - /* This are the opcodes we use to patch the Guest. The opcode for "int 321 + /* 322 + * This are the opcodes we use to patch the Guest. The opcode for "int 373 323 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we 374 - * complete the sequence with a NOP (0x90). */ 324 + * complete the sequence with a NOP (0x90). 325 + */ 375 326 u8 insn[3] = {0xcd, 0x1f, 0x90}; 376 327 377 328 __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); 378 - /* The above write might have caused a copy of that page to be made 329 + /* 330 + * The above write might have caused a copy of that page to be made 379 331 * (if it was read-only). We need to make sure the Guest has 380 332 * up-to-date pagetables. As this doesn't happen often, we can just 381 - * drop them all. */ 333 + * drop them all. 334 + */ 382 335 guest_pagetable_clear_all(cpu); 383 336 } 384 337 ··· 390 335 { 391 336 u8 insn[3]; 392 337 393 - /* This must be the Guest kernel trying to do something. 338 + /* 339 + * This must be the Guest kernel trying to do something. 394 340 * The bottom two bits of the CS segment register are the privilege 395 - * level. */ 341 + * level. 
342 + */ 396 343 if ((cpu->regs->cs & 3) != GUEST_PL) 397 344 return false; 398 345 ··· 408 351 { 409 352 switch (cpu->regs->trapnum) { 410 353 case 13: /* We've intercepted a General Protection Fault. */ 411 - /* Check if this was one of those annoying IN or OUT 354 + /* 355 + * Check if this was one of those annoying IN or OUT 412 356 * instructions which we need to emulate. If so, we just go 413 - * back into the Guest after we've done it. */ 357 + * back into the Guest after we've done it. 358 + */ 414 359 if (cpu->regs->errcode == 0) { 415 360 if (emulate_insn(cpu)) 416 361 return; 417 362 } 418 - /* If KVM is active, the vmcall instruction triggers a 419 - * General Protection Fault. Normally it triggers an 420 - * invalid opcode fault (6): */ 363 + /* 364 + * If KVM is active, the vmcall instruction triggers a General 365 + * Protection Fault. Normally it triggers an invalid opcode 366 + * fault (6): 367 + */ 421 368 case 6: 422 - /* We need to check if ring == GUEST_PL and 423 - * faulting instruction == vmcall. */ 369 + /* 370 + * We need to check if ring == GUEST_PL and faulting 371 + * instruction == vmcall. 372 + */ 424 373 if (is_hypercall(cpu)) { 425 374 rewrite_hypercall(cpu); 426 375 return; 427 376 } 428 377 break; 429 378 case 14: /* We've intercepted a Page Fault. */ 430 - /* The Guest accessed a virtual address that wasn't mapped. 379 + /* 380 + * The Guest accessed a virtual address that wasn't mapped. 431 381 * This happens a lot: we don't actually set up most of the page 432 382 * tables for the Guest at all when we start: as it runs it asks 433 383 * for more and more, and we set them up as required. In this 434 384 * case, we don't even tell the Guest that the fault happened. 435 385 * 436 386 * The errcode tells whether this was a read or a write, and 437 - * whether kernel or userspace code. */ 387 + * whether kernel or userspace code. 
388 + */ 438 389 if (demand_page(cpu, cpu->arch.last_pagefault, 439 390 cpu->regs->errcode)) 440 391 return; 441 392 442 - /* OK, it's really not there (or not OK): the Guest needs to 393 + /* 394 + * OK, it's really not there (or not OK): the Guest needs to 443 395 * know. We write out the cr2 value so it knows where the 444 396 * fault occurred. 445 397 * 446 398 * Note that if the Guest were really messed up, this could 447 399 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 448 - * lg->lguest_data could be NULL */ 400 + * lg->lguest_data could be NULL 401 + */ 449 402 if (cpu->lg->lguest_data && 450 403 put_user(cpu->arch.last_pagefault, 451 404 &cpu->lg->lguest_data->cr2)) 452 405 kill_guest(cpu, "Writing cr2"); 453 406 break; 454 407 case 7: /* We've intercepted a Device Not Available fault. */ 455 - /* If the Guest doesn't want to know, we already restored the 456 - * Floating Point Unit, so we just continue without telling 457 - * it. */ 408 + /* 409 + * If the Guest doesn't want to know, we already restored the 410 + * Floating Point Unit, so we just continue without telling it. 411 + */ 458 412 if (!cpu->ts) 459 413 return; 460 414 break; 461 415 case 32 ... 255: 462 - /* These values mean a real interrupt occurred, in which case 416 + /* 417 + * These values mean a real interrupt occurred, in which case 463 418 * the Host handler has already been run. We just do a 464 419 * friendly check if another process should now be run, then 465 - * return to run the Guest again */ 420 + * return to run the Guest again 421 + */ 466 422 cond_resched(); 467 423 return; 468 424 case LGUEST_TRAP_ENTRY: 469 - /* Our 'struct hcall_args' maps directly over our regs: we set 470 - * up the pointer now to indicate a hypercall is pending. */ 425 + /* 426 + * Our 'struct hcall_args' maps directly over our regs: we set 427 + * up the pointer now to indicate a hypercall is pending. 
428 + */ 471 429 cpu->hcall = (struct hcall_args *)cpu->regs; 472 430 return; 473 431 } 474 432 475 433 /* We didn't handle the trap, so it needs to go to the Guest. */ 476 434 if (!deliver_trap(cpu, cpu->regs->trapnum)) 477 - /* If the Guest doesn't have a handler (either it hasn't 435 + /* 436 + * If the Guest doesn't have a handler (either it hasn't 478 437 * registered any yet, or it's one of the faults we don't let 479 - * it handle), it dies with this cryptic error message. */ 438 + * it handle), it dies with this cryptic error message. 439 + */ 480 440 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 481 441 cpu->regs->trapnum, cpu->regs->eip, 482 442 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault 483 443 : cpu->regs->errcode); 484 444 } 485 445 486 - /* Now we can look at each of the routines this calls, in increasing order of 446 + /* 447 + * Now we can look at each of the routines this calls, in increasing order of 487 448 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 488 449 * deliver_trap() and demand_page(). After all those, we'll be ready to 489 450 * examine the Switcher, and our philosophical understanding of the Host/Guest 490 - * duality will be complete. :*/ 451 + * duality will be complete. 452 + :*/ 491 453 static void adjust_pge(void *on) 492 454 { 493 455 if (on) ··· 515 439 write_cr4(read_cr4() & ~X86_CR4_PGE); 516 440 } 517 441 518 - /*H:020 Now the Switcher is mapped and every thing else is ready, we need to do 519 - * some more i386-specific initialization. */ 442 + /*H:020 443 + * Now the Switcher is mapped and every thing else is ready, we need to do 444 + * some more i386-specific initialization. 
445 + */ 520 446 void __init lguest_arch_host_init(void) 521 447 { 522 448 int i; 523 449 524 - /* Most of the i386/switcher.S doesn't care that it's been moved; on 450 + /* 451 + * Most of the i386/switcher.S doesn't care that it's been moved; on 525 452 * Intel, jumps are relative, and it doesn't access any references to 526 453 * external code or data. 527 454 * ··· 532 453 * addresses are placed in a table (default_idt_entries), so we need to 533 454 * update the table with the new addresses. switcher_offset() is a 534 455 * convenience function which returns the distance between the 535 - * compiled-in switcher code and the high-mapped copy we just made. */ 456 + * compiled-in switcher code and the high-mapped copy we just made. 457 + */ 536 458 for (i = 0; i < IDT_ENTRIES; i++) 537 459 default_idt_entries[i] += switcher_offset(); 538 460 ··· 548 468 for_each_possible_cpu(i) { 549 469 /* lguest_pages() returns this CPU's two pages. */ 550 470 struct lguest_pages *pages = lguest_pages(i); 551 - /* This is a convenience pointer to make the code fit one 552 - * statement to a line. */ 471 + /* This is a convenience pointer to make the code neater. */ 553 472 struct lguest_ro_state *state = &pages->state; 554 473 555 - /* The Global Descriptor Table: the Host has a different one 474 + /* 475 + * The Global Descriptor Table: the Host has a different one 556 476 * for each CPU. We keep a descriptor for the GDT which says 557 477 * where it is and how big it is (the size is actually the last 558 - * byte, not the size, hence the "-1"). */ 478 + * byte, not the size, hence the "-1"). 479 + */ 559 480 state->host_gdt_desc.size = GDT_SIZE-1; 560 481 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 561 482 562 - /* All CPUs on the Host use the same Interrupt Descriptor 483 + /* 484 + * All CPUs on the Host use the same Interrupt Descriptor 563 485 * Table, so we just use store_idt(), which gets this CPU's IDT 564 - * descriptor. */ 486 + * descriptor. 
487 + */ 565 488 store_idt(&state->host_idt_desc); 566 489 567 - /* The descriptors for the Guest's GDT and IDT can be filled 490 + /* 491 + * The descriptors for the Guest's GDT and IDT can be filled 568 492 * out now, too. We copy the GDT & IDT into ->guest_gdt and 569 - * ->guest_idt before actually running the Guest. */ 493 + * ->guest_idt before actually running the Guest. 494 + */ 570 495 state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 571 496 state->guest_idt_desc.address = (long)&state->guest_idt; 572 497 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 573 498 state->guest_gdt_desc.address = (long)&state->guest_gdt; 574 499 575 - /* We know where we want the stack to be when the Guest enters 500 + /* 501 + * We know where we want the stack to be when the Guest enters 576 502 * the Switcher: in pages->regs. The stack grows upwards, so 577 - * we start it at the end of that structure. */ 503 + * we start it at the end of that structure. 504 + */ 578 505 state->guest_tss.sp0 = (long)(&pages->regs + 1); 579 - /* And this is the GDT entry to use for the stack: we keep a 580 - * couple of special LGUEST entries. */ 506 + /* 507 + * And this is the GDT entry to use for the stack: we keep a 508 + * couple of special LGUEST entries. 509 + */ 581 510 state->guest_tss.ss0 = LGUEST_DS; 582 511 583 - /* x86 can have a finegrained bitmap which indicates what I/O 512 + /* 513 + * x86 can have a finegrained bitmap which indicates what I/O 584 514 * ports the process can use. We set it to the end of our 585 - * structure, meaning "none". */ 515 + * structure, meaning "none". 516 + */ 586 517 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 587 518 588 - /* Some GDT entries are the same across all Guests, so we can 589 - * set them up now. */ 519 + /* 520 + * Some GDT entries are the same across all Guests, so we can 521 + * set them up now. 
522 + */ 590 523 setup_default_gdt_entries(state); 591 524 /* Most IDT entries are the same for all Guests, too.*/ 592 525 setup_default_idt_entries(state, default_idt_entries); 593 526 594 - /* The Host needs to be able to use the LGUEST segments on this 595 - * CPU, too, so put them in the Host GDT. */ 527 + /* 528 + * The Host needs to be able to use the LGUEST segments on this 529 + * CPU, too, so put them in the Host GDT. 530 + */ 596 531 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 597 532 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 598 533 } 599 534 600 - /* In the Switcher, we want the %cs segment register to use the 535 + /* 536 + * In the Switcher, we want the %cs segment register to use the 601 537 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 602 538 * it will be undisturbed when we switch. To change %cs and jump we 603 - * need this structure to feed to Intel's "lcall" instruction. */ 539 + * need this structure to feed to Intel's "lcall" instruction. 540 + */ 604 541 lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 605 542 lguest_entry.segment = LGUEST_CS; 606 543 607 - /* Finally, we need to turn off "Page Global Enable". PGE is an 544 + /* 545 + * Finally, we need to turn off "Page Global Enable". PGE is an 608 546 * optimization where page table entries are specially marked to show 609 547 * they never change. The Host kernel marks all the kernel pages this 610 548 * way because it's always present, even when userspace is running. ··· 632 534 * you'll get really weird bugs that you'll chase for two days. 633 535 * 634 536 * I used to turn PGE off every time we switched to the Guest and back 635 - * on when we return, but that slowed the Switcher down noticibly. */ 537 + * on when we return, but that slowed the Switcher down noticibly. 538 + */ 636 539 637 - /* We don't need the complexity of CPUs coming and going while we're 638 - * doing this. 
*/ 540 + /* 541 + * We don't need the complexity of CPUs coming and going while we're 542 + * doing this. 543 + */ 639 544 get_online_cpus(); 640 545 if (cpu_has_pge) { /* We have a broader idea of "global". */ 641 546 /* Remember that this was originally set (for cleanup). */ 642 547 cpu_had_pge = 1; 643 - /* adjust_pge is a helper function which sets or unsets the PGE 644 - * bit on its CPU, depending on the argument (0 == unset). */ 548 + /* 549 + * adjust_pge is a helper function which sets or unsets the PGE 550 + * bit on its CPU, depending on the argument (0 == unset). 551 + */ 645 552 on_each_cpu(adjust_pge, (void *)0, 1); 646 553 /* Turn off the feature in the global feature set. */ 647 554 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); ··· 693 590 { 694 591 u32 tsc_speed; 695 592 696 - /* The pointer to the Guest's "struct lguest_data" is the only argument. 697 - * We check that address now. */ 593 + /* 594 + * The pointer to the Guest's "struct lguest_data" is the only argument. 595 + * We check that address now. 596 + */ 698 597 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 699 598 sizeof(*cpu->lg->lguest_data))) 700 599 return -EFAULT; 701 600 702 - /* Having checked it, we simply set lg->lguest_data to point straight 601 + /* 602 + * Having checked it, we simply set lg->lguest_data to point straight 703 603 * into the Launcher's memory at the right place and then use 704 604 * copy_to_user/from_user from now on, instead of lgread/write. I put 705 605 * this in to show that I'm not immune to writing stupid 706 - * optimizations. */ 606 + * optimizations. 607 + */ 707 608 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; 708 609 709 - /* We insist that the Time Stamp Counter exist and doesn't change with 610 + /* 611 + * We insist that the Time Stamp Counter exist and doesn't change with 710 612 * cpu frequency. Some devious chip manufacturers decided that TSC 711 613 * changes could be handled in software. 
I decided that time going 712 614 * backwards might be good for benchmarks, but it's bad for users. 713 615 * 714 616 * We also insist that the TSC be stable: the kernel detects unreliable 715 - * TSCs for its own purposes, and we use that here. */ 617 + * TSCs for its own purposes, and we use that here. 618 + */ 716 619 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 717 620 tsc_speed = tsc_khz; 718 621 else ··· 734 625 } 735 626 /*:*/ 736 627 737 - /*L:030 lguest_arch_setup_regs() 628 + /*L:030 629 + * lguest_arch_setup_regs() 738 630 * 739 631 * Most of the Guest's registers are left alone: we used get_zeroed_page() to 740 - * allocate the structure, so they will be 0. */ 632 + * allocate the structure, so they will be 0. 633 + */ 741 634 void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) 742 635 { 743 636 struct lguest_regs *regs = cpu->regs; 744 637 745 - /* There are four "segment" registers which the Guest needs to boot: 638 + /* 639 + * There are four "segment" registers which the Guest needs to boot: 746 640 * The "code segment" register (cs) refers to the kernel code segment 747 641 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 748 642 * refer to the kernel data segment __KERNEL_DS. 749 643 * 750 644 * The privilege level is packed into the lower bits. The Guest runs 751 - * at privilege level 1 (GUEST_PL).*/ 645 + * at privilege level 1 (GUEST_PL). 646 + */ 752 647 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 753 648 regs->cs = __KERNEL_CS|GUEST_PL; 754 649 755 - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 650 + /* 651 + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 756 652 * is supposed to always be "1". Bit 9 (0x200) controls whether 757 653 * interrupts are enabled. We always leave interrupts enabled while 758 - * running the Guest. */ 654 + * running the Guest. 
655 + */ 759 656 regs->eflags = X86_EFLAGS_IF | 0x2; 760 657 761 - /* The "Extended Instruction Pointer" register says where the Guest is 762 - * running. */ 658 + /* 659 + * The "Extended Instruction Pointer" register says where the Guest is 660 + * running. 661 + */ 763 662 regs->eip = start; 764 663 765 - /* %esi points to our boot information, at physical address 0, so don't 766 - * touch it. */ 664 + /* 665 + * %esi points to our boot information, at physical address 0, so don't 666 + * touch it. 667 + */ 767 668 768 - /* There are a couple of GDT entries the Guest expects when first 769 - * booting. */ 669 + /* There are a couple of GDT entries the Guest expects at boot. */ 770 670 setup_guest_gdt(cpu); 771 671 }

+14 -8

drivers/lguest/x86/switcher_32.S

··· 1 - /*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the 2 - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as 3 - * it can be made, but it's naturally very specific to x86. 1 + /*P:900 2 + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride 3 + * both the Host and Guest to do the low-level Guest<->Host switch. It is as 4 + * simple as it can be made, but it's naturally very specific to x86. 4 5 * 5 6 * You have now completed Preparation. If this has whet your appetite; if you 6 7 * are feeling invigorated and refreshed then the next, more challenging stage 7 - * can be found in "make Guest". :*/ 8 + * can be found in "make Guest". 9 + :*/ 8 10 9 - /*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 11 + /*M:012 12 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 10 13 * gain at least 1% more performance. Since neither LOC nor performance can be 11 14 * measured beforehand, it generally means implementing a feature then deciding 12 15 * if it's worth it. And once it's implemented, who can say no? ··· 34 31 * Host (which is actually really easy). 35 32 * 36 33 * Two questions remain. Would the performance gain outweigh the complexity? 37 - * And who would write the verse documenting it? :*/ 34 + * And who would write the verse documenting it? 35 + :*/ 38 36 39 - /*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their 37 + /*M:011 38 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their 40 39 * code). It's worth doing though, since it would let us use oprofile in the 41 - * Host when a Guest is running. :*/ 40 + * Host when a Guest is running. 41 + :*/ 42 42 43 43 /*S:100 44 44 * Welcome to the Switcher itself!

+143 -111

drivers/virtio/virtio_pci.c

··· 52 52 char (*msix_names)[256]; 53 53 /* Number of available vectors */ 54 54 unsigned msix_vectors; 55 - /* Vectors allocated */ 55 + /* Vectors allocated, excluding per-vq vectors if any */ 56 56 unsigned msix_used_vectors; 57 + /* Whether we have vector per vq */ 58 + bool per_vq_vectors; 57 59 }; 58 60 59 61 /* Constants for MSI-X */ ··· 260 258 261 259 for (i = 0; i < vp_dev->msix_used_vectors; ++i) 262 260 free_irq(vp_dev->msix_entries[i].vector, vp_dev); 263 - vp_dev->msix_used_vectors = 0; 264 261 265 262 if (vp_dev->msix_enabled) { 266 263 /* Disable the vector used for configuration */ ··· 268 267 /* Flush the write out to device */ 269 268 ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); 270 269 271 - vp_dev->msix_enabled = 0; 272 270 pci_disable_msix(vp_dev->pci_dev); 271 + vp_dev->msix_enabled = 0; 272 + vp_dev->msix_vectors = 0; 273 273 } 274 + 275 + vp_dev->msix_used_vectors = 0; 276 + kfree(vp_dev->msix_names); 277 + vp_dev->msix_names = NULL; 278 + kfree(vp_dev->msix_entries); 279 + vp_dev->msix_entries = NULL; 274 280 } 275 281 276 - static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, 277 - int *options, int noptions) 278 - { 279 - int i; 280 - for (i = 0; i < noptions; ++i) 281 - if (!pci_enable_msix(dev, entries, options[i])) 282 - return options[i]; 283 - return -EBUSY; 284 - } 285 - 286 - static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) 282 + static int vp_request_vectors(struct virtio_device *vdev, int nvectors, 283 + bool per_vq_vectors) 287 284 { 288 285 struct virtio_pci_device *vp_dev = to_vp_device(vdev); 289 286 const char *name = dev_name(&vp_dev->vdev.dev); 290 287 unsigned i, v; 291 288 int err = -ENOMEM; 292 - /* We want at most one vector per queue and one for config changes. 293 - * Fallback to separate vectors for config and a shared for queues. 294 - * Finally fall back to regular interrupts. 
*/ 295 - int options[] = { max_vqs + 1, 2 }; 296 - int nvectors = max(options[0], options[1]); 297 289 298 - vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, 299 - GFP_KERNEL); 300 - if (!vp_dev->msix_entries) 301 - goto error_entries; 302 - vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, 303 - GFP_KERNEL); 304 - if (!vp_dev->msix_names) 305 - goto error_names; 306 - 307 - for (i = 0; i < nvectors; ++i) 308 - vp_dev->msix_entries[i].entry = i; 309 - 310 - err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, 311 - options, ARRAY_SIZE(options)); 312 - if (err < 0) { 313 - /* Can't allocate enough MSI-X vectors, use regular interrupt */ 290 + if (!nvectors) { 291 + /* Can't allocate MSI-X vectors, use regular interrupt */ 314 292 vp_dev->msix_vectors = 0; 315 293 err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, 316 294 IRQF_SHARED, name, vp_dev); 317 295 if (err) 318 - goto error_irq; 296 + return err; 319 297 vp_dev->intx_enabled = 1; 320 - } else { 321 - vp_dev->msix_vectors = err; 322 - vp_dev->msix_enabled = 1; 323 - 324 - /* Set the vector used for configuration */ 325 - v = vp_dev->msix_used_vectors; 326 - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, 327 - "%s-config", name); 328 - err = request_irq(vp_dev->msix_entries[v].vector, 329 - vp_config_changed, 0, vp_dev->msix_names[v], 330 - vp_dev); 331 - if (err) 332 - goto error_irq; 333 - ++vp_dev->msix_used_vectors; 334 - 335 - iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); 336 - /* Verify we had enough resources to assign the vector */ 337 - v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); 338 - if (v == VIRTIO_MSI_NO_VECTOR) { 339 - err = -EBUSY; 340 - goto error_irq; 341 - } 298 + return 0; 342 299 } 343 300 344 - if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { 301 + vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, 302 + GFP_KERNEL); 303 + if (!vp_dev->msix_entries) 304 + goto 
error; 305 + vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, 306 + GFP_KERNEL); 307 + if (!vp_dev->msix_names) 308 + goto error; 309 + 310 + for (i = 0; i < nvectors; ++i) 311 + vp_dev->msix_entries[i].entry = i; 312 + 313 + err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); 314 + if (err > 0) 315 + err = -ENOSPC; 316 + if (err) 317 + goto error; 318 + vp_dev->msix_vectors = nvectors; 319 + vp_dev->msix_enabled = 1; 320 + 321 + /* Set the vector used for configuration */ 322 + v = vp_dev->msix_used_vectors; 323 + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, 324 + "%s-config", name); 325 + err = request_irq(vp_dev->msix_entries[v].vector, 326 + vp_config_changed, 0, vp_dev->msix_names[v], 327 + vp_dev); 328 + if (err) 329 + goto error; 330 + ++vp_dev->msix_used_vectors; 331 + 332 + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); 333 + /* Verify we had enough resources to assign the vector */ 334 + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); 335 + if (v == VIRTIO_MSI_NO_VECTOR) { 336 + err = -EBUSY; 337 + goto error; 338 + } 339 + 340 + if (!per_vq_vectors) { 345 341 /* Shared vector for all VQs */ 346 342 v = vp_dev->msix_used_vectors; 347 343 snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, ··· 347 349 vp_vring_interrupt, 0, vp_dev->msix_names[v], 348 350 vp_dev); 349 351 if (err) 350 - goto error_irq; 352 + goto error; 351 353 ++vp_dev->msix_used_vectors; 352 354 } 353 355 return 0; 354 - error_irq: 356 + error: 355 357 vp_free_vectors(vdev); 356 - kfree(vp_dev->msix_names); 357 - error_names: 358 - kfree(vp_dev->msix_entries); 359 - error_entries: 360 358 return err; 361 359 } 362 360 363 361 static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, 364 362 void (*callback)(struct virtqueue *vq), 365 - const char *name) 363 + const char *name, 364 + u16 vector) 366 365 { 367 366 struct virtio_pci_device *vp_dev = to_vp_device(vdev); 368 367 struct 
virtio_pci_vq_info *info; 369 368 struct virtqueue *vq; 370 369 unsigned long flags, size; 371 - u16 num, vector; 370 + u16 num; 372 371 int err; 373 372 374 373 /* Select the queue we're interested in */ ··· 384 389 385 390 info->queue_index = index; 386 391 info->num = num; 387 - info->vector = VIRTIO_MSI_NO_VECTOR; 392 + info->vector = vector; 388 393 389 394 size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); 390 395 info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); ··· 408 413 vq->priv = info; 409 414 info->vq = vq; 410 415 411 - /* allocate per-vq vector if available and necessary */ 412 - if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { 413 - vector = vp_dev->msix_used_vectors; 414 - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, 415 - "%s-%s", dev_name(&vp_dev->vdev.dev), name); 416 - err = request_irq(vp_dev->msix_entries[vector].vector, 417 - vring_interrupt, 0, 418 - vp_dev->msix_names[vector], vq); 419 - if (err) 420 - goto out_request_irq; 421 - info->vector = vector; 422 - ++vp_dev->msix_used_vectors; 423 - } else 424 - vector = VP_MSIX_VQ_VECTOR; 425 - 426 - if (callback && vp_dev->msix_enabled) { 416 + if (vector != VIRTIO_MSI_NO_VECTOR) { 427 417 iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); 428 418 vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); 429 419 if (vector == VIRTIO_MSI_NO_VECTOR) { ··· 424 444 return vq; 425 445 426 446 out_assign: 427 - if (info->vector != VIRTIO_MSI_NO_VECTOR) { 428 - free_irq(vp_dev->msix_entries[info->vector].vector, vq); 429 - --vp_dev->msix_used_vectors; 430 - } 431 - out_request_irq: 432 447 vring_del_virtqueue(vq); 433 448 out_activate_queue: 434 449 iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); ··· 437 462 { 438 463 struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); 439 464 struct virtio_pci_vq_info *info = vq->priv; 440 - unsigned long size; 465 + unsigned long flags, size; 466 + 467 + 
spin_lock_irqsave(&vp_dev->lock, flags); 468 + list_del(&info->node); 469 + spin_unlock_irqrestore(&vp_dev->lock, flags); 441 470 442 471 iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); 443 - 444 - if (info->vector != VIRTIO_MSI_NO_VECTOR) 445 - free_irq(vp_dev->msix_entries[info->vector].vector, vq); 446 472 447 473 if (vp_dev->msix_enabled) { 448 474 iowrite16(VIRTIO_MSI_NO_VECTOR, ··· 465 489 /* the config->del_vqs() implementation */ 466 490 static void vp_del_vqs(struct virtio_device *vdev) 467 491 { 492 + struct virtio_pci_device *vp_dev = to_vp_device(vdev); 468 493 struct virtqueue *vq, *n; 494 + struct virtio_pci_vq_info *info; 469 495 470 - list_for_each_entry_safe(vq, n, &vdev->vqs, list) 496 + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { 497 + info = vq->priv; 498 + if (vp_dev->per_vq_vectors) 499 + free_irq(vp_dev->msix_entries[info->vector].vector, vq); 471 500 vp_del_vq(vq); 501 + } 502 + vp_dev->per_vq_vectors = false; 472 503 473 504 vp_free_vectors(vdev); 505 + } 506 + 507 + static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, 508 + struct virtqueue *vqs[], 509 + vq_callback_t *callbacks[], 510 + const char *names[], 511 + int nvectors, 512 + bool per_vq_vectors) 513 + { 514 + struct virtio_pci_device *vp_dev = to_vp_device(vdev); 515 + u16 vector; 516 + int i, err, allocated_vectors; 517 + 518 + err = vp_request_vectors(vdev, nvectors, per_vq_vectors); 519 + if (err) 520 + goto error_request; 521 + 522 + vp_dev->per_vq_vectors = per_vq_vectors; 523 + allocated_vectors = vp_dev->msix_used_vectors; 524 + for (i = 0; i < nvqs; ++i) { 525 + if (!callbacks[i] || !vp_dev->msix_enabled) 526 + vector = VIRTIO_MSI_NO_VECTOR; 527 + else if (vp_dev->per_vq_vectors) 528 + vector = allocated_vectors++; 529 + else 530 + vector = VP_MSIX_VQ_VECTOR; 531 + vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector); 532 + if (IS_ERR(vqs[i])) { 533 + err = PTR_ERR(vqs[i]); 534 + goto error_find; 535 + } 536 + /* 
allocate per-vq irq if available and necessary */ 537 + if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) { 538 + snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, 539 + "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); 540 + err = request_irq(vp_dev->msix_entries[vector].vector, 541 + vring_interrupt, 0, 542 + vp_dev->msix_names[vector], vqs[i]); 543 + if (err) { 544 + vp_del_vq(vqs[i]); 545 + goto error_find; 546 + } 547 + } 548 + } 549 + return 0; 550 + 551 + error_find: 552 + vp_del_vqs(vdev); 553 + 554 + error_request: 555 + return err; 474 556 } 475 557 476 558 /* the config->find_vqs() implementation */ ··· 538 504 const char *names[]) 539 505 { 540 506 int vectors = 0; 541 - int i, err; 507 + int i, uninitialized_var(err); 542 508 543 509 /* How many vectors would we like? */ 544 510 for (i = 0; i < nvqs; ++i) 545 511 if (callbacks[i]) 546 512 ++vectors; 547 513 548 - err = vp_request_vectors(vdev, vectors); 549 - if (err) 550 - goto error_request; 551 - 552 - for (i = 0; i < nvqs; ++i) { 553 - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); 554 - if (IS_ERR(vqs[i])) 555 - goto error_find; 556 - } 557 - return 0; 558 - 559 - error_find: 560 - vp_del_vqs(vdev); 561 - 562 - error_request: 563 - return PTR_ERR(vqs[i]); 514 + /* We want at most one vector per queue and one for config changes. */ 515 + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, 516 + vectors + 1, true); 517 + if (!err) 518 + return 0; 519 + /* Fallback to separate vectors for config and a shared for queues. */ 520 + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, 521 + 2, false); 522 + if (!err) 523 + return 0; 524 + /* Finally fall back to regular interrupts. */ 525 + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, 526 + 0, false); 527 + return err; 564 528 } 565 529 566 530 static struct virtio_config_ops virtio_pci_config_ops = {

+25 -14

include/linux/lguest.h

··· 1 - /* Things the lguest guest needs to know. Note: like all lguest interfaces, 2 - * this is subject to wild and random change between versions. */ 1 + /* 2 + * Things the lguest guest needs to know. Note: like all lguest interfaces, 3 + * this is subject to wild and random change between versions. 4 + */ 3 5 #ifndef _LINUX_LGUEST_H 4 6 #define _LINUX_LGUEST_H 5 7 ··· 13 11 #define LG_CLOCK_MIN_DELTA 100UL 14 12 #define LG_CLOCK_MAX_DELTA ULONG_MAX 15 13 16 - /*G:031 The second method of communicating with the Host is to via "struct 14 + /*G:031 15 + * The second method of communicating with the Host is to via "struct 17 16 * lguest_data". Once the Guest's initialization hypercall tells the Host where 18 - * this is, the Guest and Host both publish information in it. :*/ 19 - struct lguest_data 20 - { 21 - /* 512 == enabled (same as eflags in normal hardware). The Guest 22 - * changes interrupts so often that a hypercall is too slow. */ 17 + * this is, the Guest and Host both publish information in it. 18 + :*/ 19 + struct lguest_data { 20 + /* 21 + * 512 == enabled (same as eflags in normal hardware). The Guest 22 + * changes interrupts so often that a hypercall is too slow. 23 + */ 23 24 unsigned int irq_enabled; 24 25 /* Fine-grained interrupt disabling by the Guest */ 25 26 DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); 26 27 27 - /* The Host writes the virtual address of the last page fault here, 28 + /* 29 + * The Host writes the virtual address of the last page fault here, 28 30 * which saves the Guest a hypercall. CR2 is the native register where 29 - * this address would normally be found. */ 31 + * this address would normally be found. 32 + */ 30 33 unsigned long cr2; 31 34 32 35 /* Wallclock time set by the Host. */ 33 36 struct timespec time; 34 37 35 - /* Interrupt pending set by the Host. The Guest should do a hypercall 36 - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ 38 + /* 39 + * Interrupt pending set by the Host. 
The Guest should do a hypercall 40 + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). 41 + */ 37 42 int irq_pending; 38 43 39 - /* Async hypercall ring. Instead of directly making hypercalls, we can 44 + /* 45 + * Async hypercall ring. Instead of directly making hypercalls, we can 40 46 * place them in here for processing the next time the Host wants. 41 - * This batching can be quite efficient. */ 47 + * This batching can be quite efficient. 48 + */ 42 49 43 50 /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ 44 51 u8 hcall_status[LHCALL_RING_SIZE];

+12 -6

include/linux/lguest_launcher.h

··· 29 29 __u8 type; 30 30 /* The number of virtqueues (first in config array) */ 31 31 __u8 num_vq; 32 - /* The number of bytes of feature bits. Multiply by 2: one for host 33 - * features and one for Guest acknowledgements. */ 32 + /* 33 + * The number of bytes of feature bits. Multiply by 2: one for host 34 + * features and one for Guest acknowledgements. 35 + */ 34 36 __u8 feature_len; 35 37 /* The number of bytes of the config array after virtqueues. */ 36 38 __u8 config_len; ··· 41 39 __u8 config[0]; 42 40 }; 43 41 44 - /*D:135 This is how we expect the device configuration field for a virtqueue 45 - * to be laid out in config space. */ 42 + /*D:135 43 + * This is how we expect the device configuration field for a virtqueue 44 + * to be laid out in config space. 45 + */ 46 46 struct lguest_vqconfig { 47 47 /* The number of entries in the virtio_ring */ 48 48 __u16 num; ··· 65 61 LHREQ_EVENTFD, /* + address, fd. */ 66 62 }; 67 63 68 - /* The alignment to use between consumer and producer parts of vring. 69 - * x86 pagesize for historical reasons. */ 64 + /* 65 + * The alignment to use between consumer and producer parts of vring. 66 + * x86 pagesize for historical reasons. 67 + */ 70 68 #define LGUEST_VRING_ALIGN 4096 71 69 #endif /* _LINUX_LGUEST_LAUNCHER */

+2 -4

include/linux/virtio_blk.h

··· 20 20 21 21 #define VIRTIO_BLK_ID_BYTES (sizeof(__u16[256])) /* IDENTIFY DATA */ 22 22 23 - struct virtio_blk_config 24 - { 23 + struct virtio_blk_config { 25 24 /* The capacity (in 512-byte sectors). */ 26 25 __u64 capacity; 27 26 /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ ··· 49 50 #define VIRTIO_BLK_T_BARRIER 0x80000000 50 51 51 52 /* This is the first element of the read scatter-gather list. */ 52 - struct virtio_blk_outhdr 53 - { 53 + struct virtio_blk_outhdr { 54 54 /* VIRTIO_BLK_T* */ 55 55 __u32 type; 56 56 /* io priority. */

+1 -2

include/linux/virtio_config.h

··· 79 79 * the dev->feature bits if it wants. 80 80 */ 81 81 typedef void vq_callback_t(struct virtqueue *); 82 - struct virtio_config_ops 83 - { 82 + struct virtio_config_ops { 84 83 void (*get)(struct virtio_device *vdev, unsigned offset, 85 84 void *buf, unsigned len); 86 85 void (*set)(struct virtio_device *vdev, unsigned offset,

+2 -4

include/linux/virtio_net.h

··· 31 31 32 32 #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ 33 33 34 - struct virtio_net_config 35 - { 34 + struct virtio_net_config { 36 35 /* The config defining mac address (if VIRTIO_NET_F_MAC) */ 37 36 __u8 mac[6]; 38 37 /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ ··· 40 41 41 42 /* This is the first element of the scatter-gather list. If you don't 42 43 * specify GSO or CSUM features, you can simply ignore the header. */ 43 - struct virtio_net_hdr 44 - { 44 + struct virtio_net_hdr { 45 45 #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset 46 46 __u8 flags; 47 47 #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame

+4 -8

include/linux/virtio_ring.h

··· 30 30 #define VIRTIO_RING_F_INDIRECT_DESC 28 31 31 32 32 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ 33 - struct vring_desc 34 - { 33 + struct vring_desc { 35 34 /* Address (guest-physical). */ 36 35 __u64 addr; 37 36 /* Length. */ ··· 41 42 __u16 next; 42 43 }; 43 44 44 - struct vring_avail 45 - { 45 + struct vring_avail { 46 46 __u16 flags; 47 47 __u16 idx; 48 48 __u16 ring[]; 49 49 }; 50 50 51 51 /* u32 is used here for ids for padding reasons. */ 52 - struct vring_used_elem 53 - { 52 + struct vring_used_elem { 54 53 /* Index of start of used descriptor chain. */ 55 54 __u32 id; 56 55 /* Total length of the descriptor chain which was used (written to) */ 57 56 __u32 len; 58 57 }; 59 58 60 - struct vring_used 61 - { 59 + struct vring_used { 62 60 __u16 flags; 63 61 __u16 idx; 64 62 struct vring_used_elem ring[];

Configure Feed

Configure Feed