Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
lguest: turn Waker into a thread, not a process
lguest: Enlarge virtio rings
lguest: Use GSO/IFF_VNET_HDR extensions on tun/tap
lguest: Remove 'network: no dma buffer!' warning
lguest: Adaptive timeout
lguest: Tell Guest net not to notify us on every packet xmit
lguest: net block unneeded receive queue update notifications
lguest: wrap last_avail accesses.
lguest: use cpu capability accessors
lguest: virtio-rng support
lguest: Support assigning a MAC address
lguest: Don't leak /dev/zero fd
lguest: fix verbose printing of device features.
lguest: fix switcher_page leak on unload
lguest: Guest int3 fix
lguest: set max_pfn_mapped, growl loudly at Yinghai Lu

+407 -140
+384 -131
Documentation/lguest/lguest.c
··· 36 36 #include <sched.h> 37 37 #include <limits.h> 38 38 #include <stddef.h> 39 + #include <signal.h> 39 40 #include "linux/lguest_launcher.h" 40 41 #include "linux/virtio_config.h" 41 42 #include "linux/virtio_net.h" 42 43 #include "linux/virtio_blk.h" 43 44 #include "linux/virtio_console.h" 45 + #include "linux/virtio_rng.h" 44 46 #include "linux/virtio_ring.h" 45 47 #include "asm-x86/bootparam.h" 46 48 /*L:110 We can ignore the 39 include files we need for this program, but I do ··· 66 64 #endif 67 65 /* We can have up to 256 pages for devices. */ 68 66 #define DEVICE_PAGES 256 69 - /* This will occupy 2 pages: it must be a power of 2. */ 70 - #define VIRTQUEUE_NUM 128 67 + /* This will occupy 3 pages: it must be a power of 2. */ 68 + #define VIRTQUEUE_NUM 256 71 69 72 70 /*L:120 verbose is both a global flag and a macro. The C preprocessor allows 73 71 * this, and although I wouldn't recommend it, it works quite nicely here. */ ··· 76 74 do { if (verbose) printf(args); } while(0) 77 75 /*:*/ 78 76 79 - /* The pipe to send commands to the waker process */ 80 - static int waker_fd; 77 + /* File descriptors for the Waker. */ 78 + struct { 79 + int pipe[2]; 80 + int lguest_fd; 81 + } waker_fds; 82 + 81 83 /* The pointer to the start of guest memory. */ 82 84 static void *guest_base; 83 85 /* The maximum guest physical address allowed, and maximum possible. */ 84 86 static unsigned long guest_limit, guest_max; 87 + /* The pipe for signal hander to write to. */ 88 + static int timeoutpipe[2]; 89 + static unsigned int timeout_usec = 500; 85 90 86 91 /* a per-cpu variable indicating whose vcpu is currently running */ 87 92 static unsigned int __thread cpu_id; ··· 164 155 /* Last available index we saw. */ 165 156 u16 last_avail_idx; 166 157 167 - /* The routine to call when the Guest pings us. */ 168 - void (*handle_output)(int fd, struct virtqueue *me); 158 + /* The routine to call when the Guest pings us, or timeout. */ 159 + void (*handle_output)(int fd, struct virtqueue *me, bool timeout); 169 160 170 161 /* Outstanding buffers */ 171 162 unsigned int inflight; 163 + 164 + /* Is this blocked awaiting a timer? */ 165 + bool blocked; 172 166 }; 173 167 174 168 /* Remember the arguments to the program so we can "reboot" */ ··· 202 190 return iov->iov_base; 203 191 } 204 192 193 + /* Wrapper for the last available index. Makes it easier to change. */ 194 + #define lg_last_avail(vq) ((vq)->last_avail_idx) 195 + 205 196 /* The virtio configuration space is defined to be little-endian. x86 is 206 197 * little-endian too, but it's nice to be explicit so we have these helpers. */ 207 198 #define cpu_to_le16(v16) (v16) ··· 213 198 #define le16_to_cpu(v16) (v16) 214 199 #define le32_to_cpu(v32) (v32) 215 200 #define le64_to_cpu(v64) (v64) 201 + 202 + /* Is this iovec empty? */ 203 + static bool iov_empty(const struct iovec iov[], unsigned int num_iov) 204 + { 205 + unsigned int i; 206 + 207 + for (i = 0; i < num_iov; i++) 208 + if (iov[i].iov_len) 209 + return false; 210 + return true; 211 + } 212 + 213 + /* Take len bytes from the front of this iovec. */ 214 + static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) 215 + { 216 + unsigned int i; 217 + 218 + for (i = 0; i < num_iov; i++) { 219 + unsigned int used; 220 + 221 + used = iov[i].iov_len < len ? iov[i].iov_len : len; 222 + iov[i].iov_base += used; 223 + iov[i].iov_len -= used; 224 + len -= used; 225 + } 226 + assert(len == 0); 227 + } 216 228 217 229 /* The device virtqueue descriptors are followed by feature bitmasks. */ 218 230 static u8 *get_feature_bits(struct device *dev) ··· 296 254 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 297 255 if (addr == MAP_FAILED) 298 256 err(1, "Mmaping %u pages of /dev/zero", num); 257 + close(fd); 299 258 300 259 return addr; 301 260 } ··· 583 540 * watch, but handing a file descriptor mask through to the kernel is fairly 584 541 * icky. 585 542 * 586 - * Instead, we fork off a process which watches the file descriptors and writes 543 + * Instead, we clone off a thread which watches the file descriptors and writes 587 544 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host 588 545 * stop running the Guest. This causes the Launcher to return from the 589 546 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset 590 547 * the LHREQ_BREAK and wake us up again. 591 548 * 592 549 * This, of course, is merely a different *kind* of icky. 550 + * 551 + * Given my well-known antipathy to threads, I'd prefer to use processes. But 552 + * it's easier to share Guest memory with threads, and trivial to share the 553 + * devices.infds as the Launcher changes it. 593 554 */ 594 - static void wake_parent(int pipefd, int lguest_fd) 555 + static int waker(void *unused) 595 556 { 596 - /* Add the pipe from the Launcher to the fdset in the device_list, so 597 - * we watch it, too. */ 598 - add_device_fd(pipefd); 557 + /* Close the write end of the pipe: only the Launcher has it open. */ 558 + close(waker_fds.pipe[1]); 599 559 600 560 for (;;) { 601 561 fd_set rfds = devices.infds; 602 562 unsigned long args[] = { LHREQ_BREAK, 1 }; 563 + unsigned int maxfd = devices.max_infd; 564 + 565 + /* We also listen to the pipe from the Launcher. */ 566 + FD_SET(waker_fds.pipe[0], &rfds); 567 + if (waker_fds.pipe[0] > maxfd) 568 + maxfd = waker_fds.pipe[0]; 603 569 604 570 /* Wait until input is ready from one of the devices. */ 605 - select(devices.max_infd+1, &rfds, NULL, NULL, NULL); 606 - /* Is it a message from the Launcher? */ 607 - if (FD_ISSET(pipefd, &rfds)) { 608 - int fd; 609 - /* If read() returns 0, it means the Launcher has 610 - * exited. We silently follow. */ 611 - if (read(pipefd, &fd, sizeof(fd)) == 0) 612 - exit(0); 613 - /* Otherwise it's telling us to change what file 614 - * descriptors we're to listen to. Positive means 615 - * listen to a new one, negative means stop 616 - * listening. */ 617 - if (fd >= 0) 618 - FD_SET(fd, &devices.infds); 619 - else 620 - FD_CLR(-fd - 1, &devices.infds); 621 - } else /* Send LHREQ_BREAK command. */ 622 - pwrite(lguest_fd, args, sizeof(args), cpu_id); 571 + select(maxfd+1, &rfds, NULL, NULL, NULL); 572 + 573 + /* Message from Launcher? */ 574 + if (FD_ISSET(waker_fds.pipe[0], &rfds)) { 575 + char c; 576 + /* If this fails, then assume Launcher has exited. 577 + * Don't do anything on exit: we're just a thread! */ 578 + if (read(waker_fds.pipe[0], &c, 1) != 1) 579 + _exit(0); 580 + continue; 581 + } 582 + 583 + /* Send LHREQ_BREAK command to snap the Launcher out of it. */ 584 + pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); 623 585 } 586 + return 0; 624 587 } 625 588 626 589 /* This routine just sets up a pipe to the Waker process. */ 627 - static int setup_waker(int lguest_fd) 590 + static void setup_waker(int lguest_fd) 628 591 { 629 - int pipefd[2], child; 592 + /* This pipe is closed when Launcher dies, telling Waker. */ 593 + if (pipe(waker_fds.pipe) != 0) 594 + err(1, "Creating pipe for Waker"); 630 595 631 - /* We create a pipe to talk to the Waker, and also so it knows when the 632 - * Launcher dies (and closes pipe). */ 633 - pipe(pipefd); 634 - child = fork(); 635 - if (child == -1) 636 - err(1, "forking"); 596 + /* Waker also needs to know the lguest fd */ 597 + waker_fds.lguest_fd = lguest_fd; 637 598 638 - if (child == 0) { 639 - /* We are the Waker: close the "writing" end of our copy of the 640 - * pipe and start waiting for input. */ 641 - close(pipefd[1]); 642 - wake_parent(pipefd[0], lguest_fd); 643 - } 644 - /* Close the reading end of our copy of the pipe. */ 645 - close(pipefd[0]); 646 - 647 - /* Here is the fd used to talk to the waker. */ 648 - return pipefd[1]; 599 + if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) 600 + err(1, "Creating Waker"); 649 601 } 650 602 651 603 /* ··· 699 661 unsigned int *out_num, unsigned int *in_num) 700 662 { 701 663 unsigned int i, head; 664 + u16 last_avail; 702 665 703 666 /* Check it isn't doing very strange things with descriptor numbers. */ 704 - if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) 667 + last_avail = lg_last_avail(vq); 668 + if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 705 669 errx(1, "Guest moved used index from %u to %u", 706 - vq->last_avail_idx, vq->vring.avail->idx); 670 + last_avail, vq->vring.avail->idx); 707 671 708 672 /* If there's nothing new since last we looked, return invalid. */ 709 - if (vq->vring.avail->idx == vq->last_avail_idx) 673 + if (vq->vring.avail->idx == last_avail) 710 674 return vq->vring.num; 711 675 712 676 /* Grab the next descriptor number they're advertising, and increment 713 677 * the index we've seen. */ 714 - head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; 678 + head = vq->vring.avail->ring[last_avail % vq->vring.num]; 679 + lg_last_avail(vq)++; 715 680 716 681 /* If their number is silly, that's a fatal mistake. */ 717 682 if (head >= vq->vring.num) ··· 862 821 unsigned long args[] = { LHREQ_BREAK, 0 }; 863 822 /* Close the fd so Waker will know it has to 864 823 * exit. */ 865 - close(waker_fd); 866 - /* Just in case waker is blocked in BREAK, send 824 + close(waker_fds.pipe[1]); 825 + /* Just in case Waker is blocked in BREAK, send 867 826 * unbreak now. */ 868 827 write(fd, args, sizeof(args)); 869 828 exit(2); ··· 880 839 881 840 /* Handling output for console is simple: we just get all the output buffers 882 841 * and write them to stdout. */ 883 - static void handle_console_output(int fd, struct virtqueue *vq) 842 + static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) 884 843 { 885 844 unsigned int head, out, in; 886 845 int len; ··· 895 854 } 896 855 } 897 856 857 + static void block_vq(struct virtqueue *vq) 858 + { 859 + struct itimerval itm; 860 + 861 + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 862 + vq->blocked = true; 863 + 864 + itm.it_interval.tv_sec = 0; 865 + itm.it_interval.tv_usec = 0; 866 + itm.it_value.tv_sec = 0; 867 + itm.it_value.tv_usec = timeout_usec; 868 + 869 + setitimer(ITIMER_REAL, &itm, NULL); 870 + } 871 + 898 872 /* 899 873 * The Network 900 874 * ··· 917 861 * and write them (ignoring the first element) to this device's file descriptor 918 862 * (/dev/net/tun). 919 863 */ 920 - static void handle_net_output(int fd, struct virtqueue *vq) 864 + static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) 921 865 { 922 - unsigned int head, out, in; 866 + unsigned int head, out, in, num = 0; 923 867 int len; 924 868 struct iovec iov[vq->vring.num]; 869 + static int last_timeout_num; 925 870 926 871 /* Keep getting output buffers from the Guest until we run out. */ 927 872 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 928 873 if (in) 929 874 errx(1, "Input buffers in output queue?"); 930 - /* Check header, but otherwise ignore it (we told the Guest we 931 - * supported no features, so it shouldn't have anything 932 - * interesting). */ 933 - (void)convert(&iov[0], struct virtio_net_hdr); 934 - len = writev(vq->dev->fd, iov+1, out-1); 875 + len = writev(vq->dev->fd, iov, out); 876 + if (len < 0) 877 + err(1, "Writing network packet to tun"); 935 878 add_used_and_trigger(fd, vq, head, len); 879 + num++; 880 + } 881 + 882 + /* Block further kicks and set up a timer if we saw anything. */ 883 + if (!timeout && num) 884 + block_vq(vq); 885 + 886 + if (timeout) { 887 + if (num < last_timeout_num) 888 + timeout_usec += 10; 889 + else if (timeout_usec > 1) 890 + timeout_usec--; 891 + last_timeout_num = num; 936 892 } 937 893 } 938 894 ··· 955 887 unsigned int head, in_num, out_num; 956 888 int len; 957 889 struct iovec iov[dev->vq->vring.num]; 958 - struct virtio_net_hdr *hdr; 959 890 960 891 /* First we need a network buffer from the Guests's recv virtqueue. */ 961 892 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); ··· 963 896 * early, the Guest won't be ready yet. Wait until the device 964 897 * status says it's ready. */ 965 898 /* FIXME: Actually want DRIVER_ACTIVE here. */ 966 - if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) 967 - warn("network: no dma buffer!"); 899 + 900 + /* Now tell it we want to know if new things appear. */ 901 + dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 902 + wmb(); 903 + 968 904 /* We'll turn this back on if input buffers are registered. */ 969 905 return false; 970 906 } else if (out_num) 971 907 errx(1, "Output buffers in network recv queue?"); 972 908 973 - /* First element is the header: we set it to 0 (no features). */ 974 - hdr = convert(&iov[0], struct virtio_net_hdr); 975 - hdr->flags = 0; 976 - hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; 977 - 978 909 /* Read the packet from the device directly into the Guest's buffer. */ 979 - len = readv(dev->fd, iov+1, in_num-1); 910 + len = readv(dev->fd, iov, in_num); 980 911 if (len <= 0) 981 912 err(1, "reading network"); 982 913 983 914 /* Tell the Guest about the new packet. */ 984 - add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); 915 + add_used_and_trigger(fd, dev->vq, head, len); 985 916 986 917 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 987 918 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], ··· 992 927 /*L:215 This is the callback attached to the network and console input 993 928 * virtqueues: it ensures we try again, in case we stopped console or net 994 929 * delivery because Guest didn't have any buffers. */ 995 - static void enable_fd(int fd, struct virtqueue *vq) 930 + static void enable_fd(int fd, struct virtqueue *vq, bool timeout) 996 931 { 997 932 add_device_fd(vq->dev->fd); 998 - /* Tell waker to listen to it again */ 999 - write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); 933 + /* Snap the Waker out of its select loop. */ 934 + write(waker_fds.pipe[1], "", 1); 935 + } 936 + 937 + static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) 938 + { 939 + /* We don't need to know again when Guest refills receive buffer. */ 940 + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 941 + enable_fd(fd, vq, timeout); 1000 942 } 1001 943 1002 944 /* When the Guest tells us they updated the status field, we handle it. */ ··· 1023 951 for (vq = dev->vq; vq; vq = vq->next) { 1024 952 memset(vq->vring.desc, 0, 1025 953 vring_size(vq->config.num, getpagesize())); 1026 - vq->last_avail_idx = 0; 954 + lg_last_avail(vq) = 0; 1027 955 } 1028 956 } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 1029 957 warnx("Device %s configuration FAILED", dev->name); ··· 1032 960 1033 961 verbose("Device %s OK: offered", dev->name); 1034 962 for (i = 0; i < dev->desc->feature_len; i++) 1035 - verbose(" %08x", get_feature_bits(dev)[i]); 963 + verbose(" %02x", get_feature_bits(dev)[i]); 1036 964 verbose(", accepted"); 1037 965 for (i = 0; i < dev->desc->feature_len; i++) 1038 - verbose(" %08x", get_feature_bits(dev) 966 + verbose(" %02x", get_feature_bits(dev) 1039 967 [dev->desc->feature_len+i]); 1040 968 1041 969 if (dev->ready) ··· 1072 1000 if (strcmp(vq->dev->name, "console") != 0) 1073 1001 verbose("Output to %s\n", vq->dev->name); 1074 1002 if (vq->handle_output) 1075 - vq->handle_output(fd, vq); 1003 + vq->handle_output(fd, vq, false); 1076 1004 return; 1077 1005 } 1078 1006 } ··· 1086 1014 strnlen(from_guest_phys(addr), guest_limit - addr)); 1087 1015 } 1088 1016 1017 + static void handle_timeout(int fd) 1018 + { 1019 + char buf[32]; 1020 + struct device *i; 1021 + struct virtqueue *vq; 1022 + 1023 + /* Clear the pipe */ 1024 + read(timeoutpipe[0], buf, sizeof(buf)); 1025 + 1026 + /* Check each device and virtqueue: flush blocked ones. */ 1027 + for (i = devices.dev; i; i = i->next) { 1028 + for (vq = i->vq; vq; vq = vq->next) { 1029 + if (!vq->blocked) 1030 + continue; 1031 + 1032 + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 1033 + vq->blocked = false; 1034 + if (vq->handle_output) 1035 + vq->handle_output(fd, vq, true); 1036 + } 1037 + } 1038 + } 1039 + 1089 1040 /* This is called when the Waker wakes us up: check for incoming file 1090 1041 * descriptors. */ 1091 1042 static void handle_input(int fd) ··· 1119 1024 for (;;) { 1120 1025 struct device *i; 1121 1026 fd_set fds = devices.infds; 1027 + int num; 1122 1028 1029 + num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); 1030 + /* Could get interrupted */ 1031 + if (num < 0) 1032 + continue; 1123 1033 /* If nothing is ready, we're done. */ 1124 - if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) 1034 + if (num == 0) 1125 1035 break; 1126 1036 1127 1037 /* Otherwise, call the device(s) which have readable file 1128 1038 * descriptors and a method of handling them. */ 1129 1039 for (i = devices.dev; i; i = i->next) { 1130 1040 if (i->handle_input && FD_ISSET(i->fd, &fds)) { 1131 - int dev_fd; 1132 1041 if (i->handle_input(fd, i)) 1133 1042 continue; 1134 1043 ··· 1142 1043 * buffers to deliver into. Console also uses 1143 1044 * it when it discovers that stdin is closed. */ 1144 1045 FD_CLR(i->fd, &devices.infds); 1145 - /* Tell waker to ignore it too, by sending a 1146 - * negative fd number (-1, since 0 is a valid 1147 - * FD number). */ 1148 - dev_fd = -i->fd - 1; 1149 - write(waker_fd, &dev_fd, sizeof(dev_fd)); 1150 1046 } 1151 1047 } 1048 + 1049 + /* Is this the timeout fd? */ 1050 + if (FD_ISSET(timeoutpipe[0], &fds)) 1051 + handle_timeout(fd); 1152 1052 } 1153 1053 } 1154 1054 ··· 1196 1098 /* Each device descriptor is followed by the description of its virtqueues. We 1197 1099 * specify how many descriptors the virtqueue is to have. */ 1198 1100 static void add_virtqueue(struct device *dev, unsigned int num_descs, 1199 - void (*handle_output)(int fd, struct virtqueue *me)) 1101 + void (*handle_output)(int, struct virtqueue *, bool)) 1200 1102 { 1201 1103 unsigned int pages; 1202 1104 struct virtqueue **i, *vq = malloc(sizeof(*vq)); ··· 1212 1114 vq->last_avail_idx = 0; 1213 1115 vq->dev = dev; 1214 1116 vq->inflight = 0; 1117 + vq->blocked = false; 1215 1118 1216 1119 /* Initialize the configuration. */ 1217 1120 vq->config.num = num_descs; ··· 1345 1246 } 1346 1247 /*:*/ 1347 1248 1249 + static void timeout_alarm(int sig) 1250 + { 1251 + write(timeoutpipe[1], "", 1); 1252 + } 1253 + 1254 + static void setup_timeout(void) 1255 + { 1256 + if (pipe(timeoutpipe) != 0) 1257 + err(1, "Creating timeout pipe"); 1258 + 1259 + if (fcntl(timeoutpipe[1], F_SETFL, 1260 + fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) 1261 + err(1, "Making timeout pipe nonblocking"); 1262 + 1263 + add_device_fd(timeoutpipe[0]); 1264 + signal(SIGALRM, timeout_alarm); 1265 + } 1266 + 1348 1267 /*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1349 1268 * --sharenet=<name> option which opens or creates a named pipe. This can be 1350 1269 * used to send packets to another guest in a 1:1 manner. ··· 1381 1264 1382 1265 static u32 str2ip(const char *ipaddr) 1383 1266 { 1384 - unsigned int byte[4]; 1267 + unsigned int b[4]; 1385 1268 1386 - sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]); 1387 - return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; 1269 + if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) 1270 + errx(1, "Failed to parse IP address '%s'", ipaddr); 1271 + return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; 1272 + } 1273 + 1274 + static void str2mac(const char *macaddr, unsigned char mac[6]) 1275 + { 1276 + unsigned int m[6]; 1277 + if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", 1278 + &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) 1279 + errx(1, "Failed to parse mac address '%s'", macaddr); 1280 + mac[0] = m[0]; 1281 + mac[1] = m[1]; 1282 + mac[2] = m[2]; 1283 + mac[3] = m[3]; 1284 + mac[4] = m[4]; 1285 + mac[5] = m[5]; 1388 1286 } 1389 1287 1390 1288 /* This code is "adapted" from libbridge: it attaches the Host end of the ··· 1420 1288 errx(1, "interface %s does not exist!", if_name); 1421 1289 1422 1290 strncpy(ifr.ifr_name, br_name, IFNAMSIZ); 1291 + ifr.ifr_name[IFNAMSIZ-1] = '\0'; 1423 1292 ifr.ifr_ifindex = ifidx; 1424 1293 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) 1425 1294 err(1, "can't add %s to bridge %s", if_name, br_name); ··· 1429 1296 /* This sets up the Host end of the network device with an IP address, brings 1430 1297 * it up so packets will flow, the copies the MAC address into the hwaddr 1431 1298 * pointer. */ 1432 - static void configure_device(int fd, const char *devname, u32 ipaddr, 1433 - unsigned char hwaddr[6]) 1299 + static void configure_device(int fd, const char *tapif, u32 ipaddr) 1434 1300 { 1435 1301 struct ifreq ifr; 1436 1302 struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; 1437 1303 1438 - /* Don't read these incantations. Just cut & paste them like I did! */ 1439 1304 memset(&ifr, 0, sizeof(ifr)); 1440 - strcpy(ifr.ifr_name, devname); 1305 + strcpy(ifr.ifr_name, tapif); 1306 + 1307 + /* Don't read these incantations. Just cut & paste them like I did! */ 1441 1308 sin->sin_family = AF_INET; 1442 1309 sin->sin_addr.s_addr = htonl(ipaddr); 1443 1310 if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) 1444 - err(1, "Setting %s interface address", devname); 1311 + err(1, "Setting %s interface address", tapif); 1445 1312 ifr.ifr_flags = IFF_UP; 1446 1313 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) 1447 - err(1, "Bringing interface %s up", devname); 1314 + err(1, "Bringing interface %s up", tapif); 1315 + } 1316 + 1317 + static void get_mac(int fd, const char *tapif, unsigned char hwaddr[6]) 1318 + { 1319 + struct ifreq ifr; 1320 + 1321 + memset(&ifr, 0, sizeof(ifr)); 1322 + strcpy(ifr.ifr_name, tapif); 1448 1323 1449 1324 /* SIOC stands for Socket I/O Control. G means Get (vs S for Set 1450 1325 * above). IF means Interface, and HWADDR is hardware address. 1451 1326 * Simple! */ 1452 1327 if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) 1453 - err(1, "getting hw address for %s", devname); 1328 + err(1, "getting hw address for %s", tapif); 1454 1329 memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); 1455 1330 } 1456 1331 1457 - /*L:195 Our network is a Host<->Guest network. This can either use bridging or 1458 - * routing, but the principle is the same: it uses the "tun" device to inject 1459 - * packets into the Host as if they came in from a normal network card. We 1460 - * just shunt packets between the Guest and the tun device. */ 1461 - static void setup_tun_net(const char *arg) 1332 + static int get_tun_device(char tapif[IFNAMSIZ]) 1462 1333 { 1463 - struct device *dev; 1464 1334 struct ifreq ifr; 1465 - int netfd, ipfd; 1466 - u32 ip; 1467 - const char *br_name = NULL; 1468 - struct virtio_net_config conf; 1335 + int netfd; 1336 + 1337 + /* Start with this zeroed. Messy but sure. */ 1338 + memset(&ifr, 0, sizeof(ifr)); 1469 1339 1470 1340 /* We open the /dev/net/tun device and tell it we want a tap device. A 1471 1341 * tap device is like a tun device, only somehow different. To tell 1472 1342 * the truth, I completely blundered my way through this code, but it 1473 1343 * works now! */ 1474 1344 netfd = open_or_die("/dev/net/tun", O_RDWR); 1475 - memset(&ifr, 0, sizeof(ifr)); 1476 - ifr.ifr_flags = IFF_TAP | IFF_NO_PI; 1345 + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 1477 1346 strcpy(ifr.ifr_name, "tap%d"); 1478 1347 if (ioctl(netfd, TUNSETIFF, &ifr) != 0) 1479 1348 err(1, "configuring /dev/net/tun"); 1349 + 1350 + if (ioctl(netfd, TUNSETOFFLOAD, 1351 + TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 1352 + err(1, "Could not set features for tun device"); 1353 + 1480 1354 /* We don't need checksums calculated for packets coming in this 1481 1355 * device: trust us! */ 1482 1356 ioctl(netfd, TUNSETNOCSUM, 1); 1357 + 1358 + memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 1359 + return netfd; 1360 + } 1361 + 1362 + /*L:195 Our network is a Host<->Guest network. This can either use bridging or 1363 + * routing, but the principle is the same: it uses the "tun" device to inject 1364 + * packets into the Host as if they came in from a normal network card. We 1365 + * just shunt packets between the Guest and the tun device. */ 1366 + static void setup_tun_net(char *arg) 1367 + { 1368 + struct device *dev; 1369 + int netfd, ipfd; 1370 + u32 ip = INADDR_ANY; 1371 + bool bridging = false; 1372 + char tapif[IFNAMSIZ], *p; 1373 + struct virtio_net_config conf; 1374 + 1375 + netfd = get_tun_device(tapif); 1483 1376 1484 1377 /* First we create a new network device. */ 1485 1378 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); 1486 1379 1487 1380 /* Network devices need a receive and a send queue, just like 1488 1381 * console. */ 1489 - add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1382 + add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); 1490 1383 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); 1491 1384 1492 1385 /* We need a socket to perform the magic network ioctls to bring up the ··· 1523 1364 1524 1365 /* If the command line was --tunnet=bridge:<name> do bridging. */ 1525 1366 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { 1526 - ip = INADDR_ANY; 1527 - br_name = arg + strlen(BRIDGE_PFX); 1528 - add_to_bridge(ipfd, ifr.ifr_name, br_name); 1529 - } else /* It is an IP address to set up the device with */ 1367 + arg += strlen(BRIDGE_PFX); 1368 + bridging = true; 1369 + } 1370 + 1371 + /* A mac address may follow the bridge name or IP address */ 1372 + p = strchr(arg, ':'); 1373 + if (p) { 1374 + str2mac(p+1, conf.mac); 1375 + *p = '\0'; 1376 + } else { 1377 + p = arg + strlen(arg); 1378 + /* None supplied; query the randomly assigned mac. */ 1379 + get_mac(ipfd, tapif, conf.mac); 1380 + } 1381 + 1382 + /* arg is now either an IP address or a bridge name */ 1383 + if (bridging) 1384 + add_to_bridge(ipfd, tapif, arg); 1385 + else 1530 1386 ip = str2ip(arg); 1531 1387 1532 - /* Set up the tun device, and get the mac address for the interface. */ 1533 - configure_device(ipfd, ifr.ifr_name, ip, conf.mac); 1388 + /* Set up the tun device. */ 1389 + configure_device(ipfd, tapif, ip); 1534 1390 1535 1391 /* Tell Guest what MAC address to use. */ 1536 1392 add_feature(dev, VIRTIO_NET_F_MAC); 1537 1393 add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); 1394 + /* Expect Guest to handle everything except UFO */ 1395 + add_feature(dev, VIRTIO_NET_F_CSUM); 1396 + add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); 1397 + add_feature(dev, VIRTIO_NET_F_MAC); 1398 + add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); 1399 + add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); 1400 + add_feature(dev, VIRTIO_NET_F_GUEST_ECN); 1401 + add_feature(dev, VIRTIO_NET_F_HOST_TSO4); 1402 + add_feature(dev, VIRTIO_NET_F_HOST_TSO6); 1403 + add_feature(dev, VIRTIO_NET_F_HOST_ECN); 1538 1404 set_config(dev, sizeof(conf), &conf); 1539 1405 1540 1406 /* We don't need the socket any more; setup is done. */ 1541 1407 close(ipfd); 1542 1408 1543 - verbose("device %u: tun net %u.%u.%u.%u\n", 1544 - devices.device_num++, 1545 - (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); 1546 - if (br_name) 1547 - verbose("attached to bridge: %s\n", br_name); 1409 + devices.device_num++; 1410 + 1411 + if (bridging) 1412 + verbose("device %u: tun %s attached to bridge: %s\n", 1413 + devices.device_num, tapif, arg); 1414 + else 1415 + verbose("device %u: tun %s: %s\n", 1416 + devices.device_num, tapif, arg); 1548 1417 } 1549 1418 1550 1419 /* Our block (disk) device should be really simple: the Guest asks for a block ··· 1737 1550 } 1738 1551 1739 1552 /* When the Guest submits some I/O, we just need to wake the I/O thread. */ 1740 - static void handle_virtblk_output(int fd, struct virtqueue *vq) 1553 + static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) 1741 1554 { 1742 1555 struct vblk_info *vblk = vq->dev->priv; 1743 1556 char c = 0; ··· 1808 1621 verbose("device %u: virtblock %llu sectors\n", 1809 1622 devices.device_num, le64_to_cpu(conf.capacity)); 1810 1623 } 1624 + 1625 + /* Our random number generator device reads from /dev/random into the Guest's 1626 + * input buffers. The usual case is that the Guest doesn't want random numbers 1627 + * and so has no buffers although /dev/random is still readable, whereas 1628 + * console is the reverse. 1629 + * 1630 + * The same logic applies, however. */ 1631 + static bool handle_rng_input(int fd, struct device *dev) 1632 + { 1633 + int len; 1634 + unsigned int head, in_num, out_num, totlen = 0; 1635 + struct iovec iov[dev->vq->vring.num]; 1636 + 1637 + /* First we need a buffer from the Guests's virtqueue. */ 1638 + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1639 + 1640 + /* If they're not ready for input, stop listening to this file 1641 + * descriptor. We'll start again once they add an input buffer. */ 1642 + if (head == dev->vq->vring.num) 1643 + return false; 1644 + 1645 + if (out_num) 1646 + errx(1, "Output buffers in rng?"); 1647 + 1648 + /* This is why we convert to iovecs: the readv() call uses them, and so 1649 + * it reads straight into the Guest's buffer. We loop to make sure we 1650 + * fill it. */ 1651 + while (!iov_empty(iov, in_num)) { 1652 + len = readv(dev->fd, iov, in_num); 1653 + if (len <= 0) 1654 + err(1, "Read from /dev/random gave %i", len); 1655 + iov_consume(iov, in_num, len); 1656 + totlen += len; 1657 + } 1658 + 1659 + /* Tell the Guest about the new input. */ 1660 + add_used_and_trigger(fd, dev->vq, head, totlen); 1661 + 1662 + /* Everything went OK! */ 1663 + return true; 1664 + } 1665 + 1666 + /* And this creates a "hardware" random number device for the Guest. */ 1667 + static void setup_rng(void) 1668 + { 1669 + struct device *dev; 1670 + int fd; 1671 + 1672 + fd = open_or_die("/dev/random", O_RDONLY); 1673 + 1674 + /* The device responds to return from I/O thread. */ 1675 + dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); 1676 + 1677 + /* The device has one virtqueue, where the Guest places inbufs. */ 1678 + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1679 + 1680 + verbose("device %u: rng\n", devices.device_num++); 1681 + } 1811 1682 /* That's the end of device setup. */ 1812 1683 1813 1684 /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ ··· 1873 1628 { 1874 1629 unsigned int i; 1875 1630 1876 - /* Closing pipes causes the Waker thread and io_threads to die, and 1877 - * closing /dev/lguest cleans up the Guest. Since we don't track all 1878 - * open fds, we simply close everything beyond stderr. */ 1631 + /* Since we don't track all open fds, we simply close everything beyond 1632 + * stderr. */ 1879 1633 for (i = 3; i < FD_SETSIZE; i++) 1880 1634 close(i); 1635 + 1636 + /* The exec automatically gets rid of the I/O and Waker threads. */ 1881 1637 execv(main_args[0], main_args); 1882 1638 err(1, "Could not exec %s", main_args[0]); 1883 1639 } ··· 1909 1663 /* ERESTART means that we need to reboot the guest */ 1910 1664 } else if (errno == ERESTART) { 1911 1665 restart_guest(); 1912 - /* EAGAIN means the Waker wanted us to look at some input. 1666 + /* EAGAIN means a signal (timeout). 1913 1667 * Anything else means a bug or incompatible change. */ 1914 1668 } else if (errno != EAGAIN) 1915 1669 err(1, "Running guest failed"); ··· 1937 1691 { "verbose", 0, NULL, 'v' }, 1938 1692 { "tunnet", 1, NULL, 't' }, 1939 1693 { "block", 1, NULL, 'b' }, 1694 + { "rng", 0, NULL, 'r' }, 1940 1695 { "initrd", 1, NULL, 'i' }, 1941 1696 { NULL }, 1942 1697 }; 1943 1698 static void usage(void) 1944 1699 { 1945 1700 errx(1, "Usage: lguest [--verbose] " 1946 - "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" 1701 + "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n" 1947 1702 "|--block=<filename>|--initrd=<filename>]...\n" 1948 1703 "<mem-in-mb> vmlinux [args...]"); 1949 1704 } ··· 2012 1765 case 'b': 2013 1766 setup_block_file(optarg); 2014 1767 break; 1768 + case 'r': 1769 + setup_rng(); 1770 + break; 2015 1771 case 'i': 2016 1772 initrd_name = optarg; 2017 1773 break; ··· 2032 1782 2033 1783 /* We always have a console device */ 2034 1784 setup_console(); 1785 + 1786 + /* We can timeout waiting for Guest network transmit. */ 1787 + setup_timeout(); 2035 1788 2036 1789 /* Now we load the kernel */ 2037 1790 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); ··· 2079 1826 * /dev/lguest file descriptor. */ 2080 1827 lguest_fd = tell_kernel(pgdir, start); 2081 1828 2082 - /* We fork off a child process, which wakes the Launcher whenever one 2083 - * of the input file descriptors needs attention. We call this the 2084 - * Waker, and we'll cover it in a moment. */ 2085 - waker_fd = setup_waker(lguest_fd); 1829 + /* We clone off a thread, which wakes the Launcher whenever one of the 1830 + * input file descriptors needs attention. We call this the Waker, and 1831 + * we'll cover it in a moment. */ 1832 + setup_waker(lguest_fd); 2086 1833 2087 1834 /* Finally, run the Guest. This doesn't return. */ 2088 1835 run_guest(lguest_fd);
+3
arch/x86/lguest/boot.c
··· 1014 1014 init_pg_tables_start = __pa(pg0); 1015 1015 init_pg_tables_end = __pa(pg0); 1016 1016 1017 + /* As described in head_32.S, we map the first 128M of memory. */ 1018 + max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; 1019 + 1017 1020 /* Load the %fs segment register (the per-cpu segment register) with 1018 1021 * the normal data segment to get through booting. */ 1019 1022 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
+1
drivers/lguest/core.c
··· 135 135 /* Now we just need to free the pages we copied the switcher into */ 136 136 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) 137 137 __free_pages(switcher_page[i], 0); 138 + kfree(switcher_page); 138 139 } 139 140 140 141 /*H:032
+17 -7
drivers/lguest/interrupts_and_traps.c
··· 406 406 * deliver_trap() to bounce it back into the Guest. */ 407 407 static void default_idt_entry(struct desc_struct *idt, 408 408 int trap, 409 - const unsigned long handler) 409 + const unsigned long handler, 410 + const struct desc_struct *base) 410 411 { 411 412 /* A present interrupt gate. */ 412 413 u32 flags = 0x8e00; ··· 416 415 * the Guest to use the "int" instruction to trigger it. */ 417 416 if (trap == LGUEST_TRAP_ENTRY) 418 417 flags |= (GUEST_PL << 13); 418 + else if (base) 419 + /* Copy priv. level from what Guest asked for. This allows 420 + * debug (int 3) traps from Guest userspace, for example. */ 421 + flags |= (base->b & 0x6000); 419 422 420 423 /* Now pack it into the IDT entry in its weird format. */ 421 424 idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); ··· 433 428 unsigned int i; 434 429 435 430 for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) 436 - default_idt_entry(&state->guest_idt[i], i, def[i]); 431 + default_idt_entry(&state->guest_idt[i], i, def[i], NULL); 437 432 } 438 433 439 434 /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead ··· 447 442 /* We can simply copy the direct traps, otherwise we use the default 448 443 * ones in the Switcher: they will return to the Host. */ 449 444 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { 445 + const struct desc_struct *gidt = &cpu->arch.idt[i]; 446 + 450 447 /* If no Guest can ever override this trap, leave it alone. */ 451 448 if (!direct_trap(i)) 452 449 continue; ··· 456 449 /* Only trap gates (type 15) can go direct to the Guest. 457 450 * Interrupt gates (type 14) disable interrupts as they are 458 451 * entered, which we never let the Guest do. Not present 459 - * entries (type 0x0) also can't go direct, of course. */ 460 - if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF) 461 - idt[i] = cpu->arch.idt[i]; 452 + * entries (type 0x0) also can't go direct, of course. 453 + * 454 + * If it can't go direct, we still need to copy the priv. level: 455 + * they might want to give userspace access to a software 456 + * interrupt. */ 457 + if (idt_type(gidt->a, gidt->b) == 0xF) 458 + idt[i] = *gidt; 462 459 else 463 - /* Reset it to the default. */ 464 - default_idt_entry(&idt[i], i, def[i]); 460 + default_idt_entry(&idt[i], i, def[i], gidt); 465 461 } 466 462 } 467 463
+2 -2
drivers/lguest/x86/core.c
··· 480 480 * bit on its CPU, depending on the argument (0 == unset). */ 481 481 on_each_cpu(adjust_pge, (void *)0, 1); 482 482 /* Turn off the feature in the global feature set. */ 483 - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 483 + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 484 484 } 485 485 put_online_cpus(); 486 486 }; ··· 491 491 /* If we had PGE before we started, turn it back on now. */ 492 492 get_online_cpus(); 493 493 if (cpu_had_pge) { 494 - set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 494 + set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 495 495 /* adjust_pge's argument "1" means set PGE. */ 496 496 on_each_cpu(adjust_pge, (void *)1, 1); 497 497 }