// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018, Google LLC.
 */

#include "linux/bitmap.h"
#include "test_util.h"
#include "kvm_util.h"
#include "pmu.h"
#include "processor.h"
#include "smm.h"
#include "svm_util.h"
#include "sev.h"
#include "vmx.h"

#ifndef NUM_INTERRUPTS
#define NUM_INTERRUPTS 256
#endif

#define KERNEL_CS	0x8
#define KERNEL_DS	0x10
#define KERNEL_TSS	0x18

gva_t exception_handlers;
bool host_cpu_is_amd;
bool host_cpu_is_intel;
bool host_cpu_is_hygon;
bool host_cpu_is_amd_compatible;
bool is_forced_emulation_enabled;
u64 guest_tsc_khz;

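/* Convert an exception vector into a human-readable string, e.g. for assertion messages. */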
const char *ex_str(int vector)
{
	switch (vector) {
#define VEC_STR(v)	case v##_VECTOR: return "#" #v
	case DE_VECTOR: return "no exception";
	case KVM_MAGIC_DE_VECTOR: return "#DE";
	VEC_STR(DB);
	VEC_STR(NMI);
	VEC_STR(BP);
	VEC_STR(OF);
	VEC_STR(BR);
	VEC_STR(UD);
	VEC_STR(NM);
	VEC_STR(DF);
	VEC_STR(TS);
	VEC_STR(NP);
	VEC_STR(SS);
	VEC_STR(GP);
	VEC_STR(PF);
	VEC_STR(MF);
	VEC_STR(AC);
	VEC_STR(MC);
	VEC_STR(XM);
	VEC_STR(VE);
	VEC_STR(CP);
	VEC_STR(HV);
	VEC_STR(VC);
	VEC_STR(SX);
	default: return "#??";
#undef VEC_STR
	}
}

static void regs_dump(FILE *stream, struct kvm_regs *regs, u8 indent)
{
	fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
		"rcx: 0x%.16llx rdx: 0x%.16llx\n",
		indent, "",
		regs->rax, regs->rbx, regs->rcx, regs->rdx);
	fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
		"rsp: 0x%.16llx rbp: 0x%.16llx\n",
		indent, "",
		regs->rsi, regs->rdi, regs->rsp, regs->rbp);
	fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "
		"r10: 0x%.16llx r11: 0x%.16llx\n",
		indent, "",
		regs->r8, regs->r9, regs->r10, regs->r11);
	fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
		"r14: 0x%.16llx r15: 0x%.16llx\n",
		indent, "",
		regs->r12, regs->r13, regs->r14, regs->r15);
	fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
		indent, "",
		regs->rip, regs->rflags);
}

static void segment_dump(FILE *stream, struct kvm_segment *segment,
			 u8 indent)
{
	fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
		"selector: 0x%.4x type: 0x%.2x\n",
		indent, "", segment->base, segment->limit,
		segment->selector, segment->type);
	fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
		"db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
		indent, "", segment->present, segment->dpl,
		segment->db, segment->s, segment->l);
	fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
		"unusable: 0x%.2x padding: 0x%.2x\n",
		indent, "", segment->g, segment->avl,
		segment->unusable, segment->padding);
}

static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
			u8 indent)
{
	fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
		"padding: 0x%.4x 0x%.4x 0x%.4x\n",
		indent, "", dtable->base, dtable->limit,
		dtable->padding[0], dtable->padding[1], dtable->padding[2]);
}

static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, u8 indent)
{
	unsigned int i;

	fprintf(stream, "%*scs:\n", indent, "");
	segment_dump(stream, &sregs->cs, indent + 2);
	fprintf(stream, "%*sds:\n", indent, "");
	segment_dump(stream, &sregs->ds, indent + 2);
	fprintf(stream, "%*ses:\n", indent, "");
	segment_dump(stream, &sregs->es, indent + 2);
	fprintf(stream, "%*sfs:\n", indent, "");
	segment_dump(stream, &sregs->fs, indent + 2);
	fprintf(stream, "%*sgs:\n", indent, "");
	segment_dump(stream, &sregs->gs, indent + 2);
	fprintf(stream, "%*sss:\n", indent, "");
	segment_dump(stream, &sregs->ss, indent + 2);
	fprintf(stream, "%*str:\n", indent, "");
	segment_dump(stream, &sregs->tr, indent + 2);
	fprintf(stream, "%*sldt:\n", indent, "");
	segment_dump(stream, &sregs->ldt, indent + 2);

	fprintf(stream, "%*sgdt:\n", indent, "");
	dtable_dump(stream, &sregs->gdt, indent + 2);
	fprintf(stream, "%*sidt:\n", indent, "");
	dtable_dump(stream, &sregs->idt, indent + 2);

	fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
		"cr3: 0x%.16llx cr4: 0x%.16llx\n",
		indent, "",
		sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
	fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
		"apic_base: 0x%.16llx\n",
		indent, "",
		sregs->cr8, sregs->efer, sregs->apic_base);

	fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
	for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
		fprintf(stream, "%*s%.16llx\n", indent + 2, "",
			sregs->interrupt_bitmap[i]);
	}
}

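/*
 * Check whether KVM is using TDP by reading the "ept" (Intel) or "npt" (AMD)
 * module parameter of the loaded vendor module.
 */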
bool kvm_is_tdp_enabled(void)
{
	if (host_cpu_is_intel)
		return get_kvm_intel_param_bool("ept");
	else
		return get_kvm_amd_param_bool("npt");
}

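/*
 * Initialize an MMU instance: allocate the top-level page table on first use
 * and record the PTE bit layout used for all subsequent mappings.
 */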
static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu,
			  struct pte_masks *pte_masks)
{
	/* If needed, create the top-level page table. */
	if (!mmu->pgd_created) {
		mmu->pgd = vm_alloc_page_table(vm);
		mmu->pgd_created = true;
		mmu->arch.pte_masks = *pte_masks;
	}

	TEST_ASSERT(mmu->pgtable_levels == 4 || mmu->pgtable_levels == 5,
		    "Selftests MMU only supports 4-level and 5-level paging, not %u-level paging",
		    mmu->pgtable_levels);
}

void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
		    "Unknown or unsupported guest mode: 0x%x", vm->mode);

	struct pte_masks pte_masks = (struct pte_masks){
		.present = BIT_ULL(0),
		.writable = BIT_ULL(1),
		.user = BIT_ULL(2),
		.accessed = BIT_ULL(5),
		.dirty = BIT_ULL(6),
		.huge = BIT_ULL(7),
		.nx = BIT_ULL(63),
		.executable = 0,
		.c = vm->arch.c_bit,
		.s = vm->arch.s_bit,
	};

	virt_mmu_init(vm, &vm->mmu, &pte_masks);
}

void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels,
		  struct pte_masks *pte_masks)
{
	TEST_ASSERT(!vm->stage2_mmu.pgtable_levels, "TDP MMU already initialized");

	vm->stage2_mmu.pgtable_levels = pgtable_levels;
	virt_mmu_init(vm, &vm->stage2_mmu, pte_masks);
}

static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu,
			  u64 *parent_pte, gva_t gva, int level)
{
	u64 pt_gpa = PTE_GET_PA(*parent_pte);
	u64 *page_table = addr_gpa2hva(vm, pt_gpa);
	int index = (gva >> PG_LEVEL_SHIFT(level)) & 0x1ffu;

	TEST_ASSERT((*parent_pte == mmu->pgd) || is_present_pte(mmu, parent_pte),
		    "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
		    level + 1, gva);

	return &page_table[index];
}

static u64 *virt_create_upper_pte(struct kvm_vm *vm,
				  struct kvm_mmu *mmu,
				  u64 *parent_pte,
				  gva_t gva,
				  gpa_t gpa,
				  int current_level,
				  int target_level)
{
	u64 *pte = virt_get_pte(vm, mmu, parent_pte, gva, current_level);

	gpa = vm_untag_gpa(vm, gpa);

	if (!is_present_pte(mmu, pte)) {
		*pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) |
		       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) |
		       PTE_ALWAYS_SET_MASK(mmu);
		if (current_level == target_level)
			*pte |= PTE_HUGE_MASK(mmu) | (gpa & PHYSICAL_PAGE_MASK);
		else
			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
	} else {
		/*
		 * Entry already present. Assert that the caller doesn't want
		 * a hugepage at this level, and that there isn't a hugepage at
		 * this level.
		 */
		TEST_ASSERT(current_level != target_level,
			    "Cannot create hugepage at level: %u, gva: 0x%lx",
			    current_level, gva);
		TEST_ASSERT(!is_huge_pte(mmu, pte),
			    "Cannot create page table at level: %u, gva: 0x%lx",
			    current_level, gva);
	}
	return pte;
}

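/*
 * Map one page of size PG_LEVEL_SIZE(@level) at @gva to @gpa in the given
 * MMU, allocating any missing upper-level page tables along the way. For the
 * stage-2 (TDP) MMU, @gva is an L2 guest physical address.
 */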
void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, gva_t gva,
		   gpa_t gpa, int level)
{
	const u64 pg_size = PG_LEVEL_SIZE(level);
	u64 *pte = &mmu->pgd;
	int current_level;

	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
		    "Unknown or unsupported guest mode: 0x%x", vm->mode);

	TEST_ASSERT((gva % pg_size) == 0,
		    "Virtual address not aligned,\n"
		    "gva: 0x%lx page size: 0x%lx", gva, pg_size);
	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (gva >> vm->page_shift)),
		    "Invalid virtual address, gva: 0x%lx", gva);
	TEST_ASSERT((gpa % pg_size) == 0,
		    "Physical address not aligned,\n"
		    "  gpa: 0x%lx page size: 0x%lx", gpa, pg_size);
	TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn,
		    "Physical address beyond maximum supported,\n"
		    "  gpa: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
		    gpa, vm->max_gfn, vm->page_size);
	TEST_ASSERT(vm_untag_gpa(vm, gpa) == gpa,
		    "Unexpected bits in gpa: %lx", gpa);

	TEST_ASSERT(!PTE_EXECUTABLE_MASK(mmu) || !PTE_NX_MASK(mmu),
		    "X and NX bit masks cannot be used simultaneously");

	/*
	 * Allocate upper level page tables, if not already present. Return
	 * early if a hugepage was created.
	 */
	for (current_level = mmu->pgtable_levels;
	     current_level > PG_LEVEL_4K;
	     current_level--) {
		pte = virt_create_upper_pte(vm, mmu, pte, gva, gpa,
					    current_level, level);
		if (is_huge_pte(mmu, pte))
			return;
	}

	/* Fill in page table entry. */
	pte = virt_get_pte(vm, mmu, pte, gva, PG_LEVEL_4K);
	TEST_ASSERT(!is_present_pte(mmu, pte),
		    "PTE already present for 4k page at gva: 0x%lx", gva);
	*pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) |
	       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) |
	       PTE_ALWAYS_SET_MASK(mmu) | (gpa & PHYSICAL_PAGE_MASK);

	/*
	 * Neither SEV nor TDX supports shared page tables, so only the final
	 * leaf PTE needs to have the C/S-bit set manually.
	 */
	if (vm_is_gpa_protected(vm, gpa))
		*pte |= PTE_C_BIT_MASK(mmu);
	else
		*pte |= PTE_S_BIT_MASK(mmu);
}

void virt_arch_pg_map(struct kvm_vm *vm, gva_t gva, gpa_t gpa)
{
	__virt_pg_map(vm, &vm->mmu, gva, gpa, PG_LEVEL_4K);
}

void virt_map_level(struct kvm_vm *vm, gva_t gva, gpa_t gpa,
		    u64 nr_bytes, int level)
{
	u64 pg_size = PG_LEVEL_SIZE(level);
	u64 nr_pages = nr_bytes / pg_size;
	int i;

	TEST_ASSERT(nr_bytes % pg_size == 0,
		    "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx",
		    nr_bytes, pg_size);

	for (i = 0; i < nr_pages; i++) {
		__virt_pg_map(vm, &vm->mmu, gva, gpa, level);
		sparsebit_set_num(vm->vpages_mapped, gva >> vm->page_shift,
				  nr_bytes / PAGE_SIZE);

		gva += pg_size;
		gpa += pg_size;
	}
}

static bool vm_is_target_pte(struct kvm_mmu *mmu, u64 *pte,
			     int *level, int current_level)
{
	if (is_huge_pte(mmu, pte)) {
		TEST_ASSERT(*level == PG_LEVEL_NONE ||
			    *level == current_level,
			    "Unexpected hugepage at level %d", current_level);
		*level = current_level;
	}

	return *level == current_level;
}

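/*
 * Walk the page tables of the given MMU for @gva and return a host pointer to
 * the PTE at the requested *level, or to the entry that terminates the walk
 * (e.g. a hugepage). *level is updated to the level at which the walk stopped.
 */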
static u64 *__vm_get_page_table_entry(struct kvm_vm *vm,
				      struct kvm_mmu *mmu,
				      gva_t gva,
				      int *level)
{
	int va_width = 12 + (mmu->pgtable_levels) * 9;
	u64 *pte = &mmu->pgd;
	int current_level;

	TEST_ASSERT(!vm->arch.is_pt_protected,
		    "Walking page tables of protected guests is impossible");

	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= mmu->pgtable_levels,
		    "Invalid PG_LEVEL_* '%d'", *level);

	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
		    "Unknown or unsupported guest mode: 0x%x", vm->mode);
	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (gva >> vm->page_shift)),
		    "Invalid virtual address, gva: 0x%lx", gva);
	/*
	 * Check that the gva is a sign-extended va_width value.
	 */
	TEST_ASSERT(gva == (((s64)gva << (64 - va_width) >> (64 - va_width))),
		    "Canonical check failed. The virtual address is invalid.");

	for (current_level = mmu->pgtable_levels;
	     current_level > PG_LEVEL_4K;
	     current_level--) {
		pte = virt_get_pte(vm, mmu, pte, gva, current_level);
		if (vm_is_target_pte(mmu, pte, level, current_level))
			return pte;
	}

	return virt_get_pte(vm, mmu, pte, gva, PG_LEVEL_4K);
}

u64 *tdp_get_pte(struct kvm_vm *vm, u64 l2_gpa)
{
	int level = PG_LEVEL_4K;

	return __vm_get_page_table_entry(vm, &vm->stage2_mmu, l2_gpa, &level);
}

u64 *vm_get_pte(struct kvm_vm *vm, gva_t gva)
{
	int level = PG_LEVEL_4K;

	return __vm_get_page_table_entry(vm, &vm->mmu, gva, &level);
}

void virt_arch_dump(FILE *stream, struct kvm_vm *vm, u8 indent)
{
	struct kvm_mmu *mmu = &vm->mmu;
	u64 *pml4e, *pml4e_start;
	u64 *pdpe, *pdpe_start;
	u64 *pde, *pde_start;
	u64 *pte, *pte_start;

	if (!mmu->pgd_created)
		return;

	fprintf(stream, "%*s "
		" no\n", indent, "");
	fprintf(stream, "%*s index hvaddr gpaddr "
		"addr w exec dirty\n",
		indent, "");
	pml4e_start = (u64 *)addr_gpa2hva(vm, mmu->pgd);
	for (u16 n1 = 0; n1 <= 0x1ffu; n1++) {
		pml4e = &pml4e_start[n1];
		if (!is_present_pte(mmu, pml4e))
			continue;
		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
			" %u\n",
			indent, "",
			pml4e - pml4e_start, pml4e,
			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
			is_writable_pte(mmu, pml4e), is_nx_pte(mmu, pml4e));

		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
		for (u16 n2 = 0; n2 <= 0x1ffu; n2++) {
			pdpe = &pdpe_start[n2];
			if (!is_present_pte(mmu, pdpe))
				continue;
			fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx "
				"%u %u\n",
				indent, "",
				pdpe - pdpe_start, pdpe,
				addr_hva2gpa(vm, pdpe),
				PTE_GET_PFN(*pdpe), is_writable_pte(mmu, pdpe),
				is_nx_pte(mmu, pdpe));

			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
			for (u16 n3 = 0; n3 <= 0x1ffu; n3++) {
				pde = &pde_start[n3];
				if (!is_present_pte(mmu, pde))
					continue;
				fprintf(stream, "%*spde 0x%-3zx %p "
					"0x%-12lx 0x%-10llx %u %u\n",
					indent, "", pde - pde_start, pde,
					addr_hva2gpa(vm, pde),
					PTE_GET_PFN(*pde), is_writable_pte(mmu, pde),
					is_nx_pte(mmu, pde));

				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
				for (u16 n4 = 0; n4 <= 0x1ffu; n4++) {
					pte = &pte_start[n4];
					if (!is_present_pte(mmu, pte))
						continue;
					fprintf(stream, "%*spte 0x%-3zx %p "
						"0x%-12lx 0x%-10llx %u %u "
						" %u 0x%-10lx\n",
						indent, "",
						pte - pte_start, pte,
						addr_hva2gpa(vm, pte),
						PTE_GET_PFN(*pte),
						is_writable_pte(mmu, pte),
						is_nx_pte(mmu, pte),
						is_dirty_pte(mmu, pte),
						((u64)n1 << 27)
						| ((u64)n2 << 18)
						| ((u64)n3 << 9)
						| ((u64)n4));
				}
			}
		}
	}
}

void vm_enable_tdp(struct kvm_vm *vm)
{
	if (kvm_cpu_has(X86_FEATURE_VMX))
		vm_enable_ept(vm);
	else
		vm_enable_npt(vm);
}

bool kvm_cpu_has_tdp(void)
{
	return kvm_cpu_has_ept() || kvm_cpu_has_npt();
}

void __tdp_map(struct kvm_vm *vm, gpa_t l2_gpa, gpa_t gpa, u64 size, int level)
{
	size_t page_size = PG_LEVEL_SIZE(level);
	size_t npages = size / page_size;

	TEST_ASSERT(l2_gpa + size > l2_gpa, "L2 GPA overflow");
	TEST_ASSERT(gpa + size > gpa, "GPA overflow");

	while (npages--) {
		__virt_pg_map(vm, &vm->stage2_mmu, l2_gpa, gpa, level);
		l2_gpa += page_size;
		gpa += page_size;
	}
}

void tdp_map(struct kvm_vm *vm, gpa_t l2_gpa, gpa_t gpa, u64 size)
{
	__tdp_map(vm, l2_gpa, gpa, size, PG_LEVEL_4K);
}

/*
 * Prepare an identity extended page table that maps all the physical pages
 * in VM.
 */
void tdp_identity_map_default_memslots(struct kvm_vm *vm)
{
	u32 s, memslot = 0;
	sparsebit_idx_t i, last;
	struct userspace_mem_region *region = memslot2region(vm, memslot);

	/* Only memslot 0 is mapped here, ensure it's the only one being used */
	for (s = 0; s < NR_MEM_REGIONS; s++)
		TEST_ASSERT_EQ(vm->memslots[s], 0);

	i = (region->region.guest_phys_addr >> vm->page_shift) - 1;
	last = i + (region->region.memory_size >> vm->page_shift);
	for (;;) {
		i = sparsebit_next_clear(region->unused_phy_pages, i);
		if (i > last)
			break;

		tdp_map(vm, (u64)i << vm->page_shift,
			(u64)i << vm->page_shift, 1 << vm->page_shift);
	}
}

/* Identity map a region with 1GiB Pages. */
void tdp_identity_map_1g(struct kvm_vm *vm, u64 addr, u64 size)
{
	__tdp_map(vm, addr, addr, size, PG_LEVEL_1G);
}

/*
 * Set Unusable Segment
 *
 * Input Args: None
 *
 * Output Args:
 *   segp - Pointer to segment register
 *
 * Return: None
 *
 * Sets the segment register pointed to by @segp to an unusable state.
 */
static void kvm_seg_set_unusable(struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->unusable = true;
}

static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
{
	void *gdt = addr_gva2hva(vm, vm->arch.gdt);
	struct desc64 *desc = gdt + (segp->selector >> 3) * 8;

	desc->limit0 = segp->limit & 0xFFFF;
	desc->base0 = segp->base & 0xFFFF;
	desc->base1 = segp->base >> 16;
	desc->type = segp->type;
	desc->s = segp->s;
	desc->dpl = segp->dpl;
	desc->p = segp->present;
	desc->limit1 = segp->limit >> 16;
	desc->avl = segp->avl;
	desc->l = segp->l;
	desc->db = segp->db;
	desc->g = segp->g;
	desc->base2 = segp->base >> 24;
	if (!segp->s)
		desc->base3 = segp->base >> 32;
}

static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->selector = KERNEL_CS;
	segp->limit = 0xFFFFFFFFu;
	segp->s = 0x1; /* kTypeCodeData */
	segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
					  * | kFlagCodeReadable
					  */
	segp->g = true;
	segp->l = true;
	segp->present = 1;
}

static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->selector = KERNEL_DS;
	segp->limit = 0xFFFFFFFFu;
	segp->s = 0x1; /* kTypeCodeData */
	segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
					  * | kFlagDataWritable
					  */
	segp->g = true;
	segp->present = true;
}

gpa_t addr_arch_gva2gpa(struct kvm_vm *vm, gva_t gva)
{
	int level = PG_LEVEL_NONE;
	u64 *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level);

	TEST_ASSERT(is_present_pte(&vm->mmu, pte),
		    "Leaf PTE not PRESENT for gva: 0x%08lx", gva);

	/*
	 * No need for a hugepage mask on the PTE, x86-64 requires the "unused"
	 * address bits to be zero.
	 */
	return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level));
}

static void kvm_seg_set_tss_64bit(gva_t base, struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->base = base;
	segp->limit = 0x67;
	segp->selector = KERNEL_TSS;
	segp->type = 0xb;
	segp->present = 1;
}

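/*
 * Initialize the vCPU's system registers for 64-bit mode: point the GDT/IDT
 * at the tables built by vm_init_descriptor_tables(), enable paging and long
 * mode, load flat kernel code/data segments, and set CR3 to the VM's
 * top-level page table.
 */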
static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
{
	struct kvm_sregs sregs;

	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
		    "Unknown or unsupported guest mode: 0x%x", vm->mode);

	/* Set mode specific system register values. */
	vcpu_sregs_get(vcpu, &sregs);

	sregs.idt.base = vm->arch.idt;
	sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
	sregs.gdt.base = vm->arch.gdt;
	sregs.gdt.limit = getpagesize() - 1;

	sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
	if (kvm_cpu_has(X86_FEATURE_XSAVE))
		sregs.cr4 |= X86_CR4_OSXSAVE;
	if (vm->mmu.pgtable_levels == 5)
		sregs.cr4 |= X86_CR4_LA57;
	sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);

	kvm_seg_set_unusable(&sregs.ldt);
	kvm_seg_set_kernel_code_64bit(&sregs.cs);
	kvm_seg_set_kernel_data_64bit(&sregs.ds);
	kvm_seg_set_kernel_data_64bit(&sregs.es);
	kvm_seg_set_kernel_data_64bit(&sregs.gs);
	kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr);

	sregs.cr3 = vm->mmu.pgd;
	vcpu_sregs_set(vcpu, &sregs);
}

static void vcpu_init_xcrs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
{
	struct kvm_xcrs xcrs = {
		.nr_xcrs = 1,
		.xcrs[0].xcr = 0,
		.xcrs[0].value = kvm_cpu_supported_xcr0(),
	};

	if (!kvm_cpu_has(X86_FEATURE_XSAVE))
		return;

	vcpu_xcrs_set(vcpu, &xcrs);
}

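/* Install a 64-bit interrupt gate for @vector in the VM's IDT, using @selector at the given @dpl. */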
static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
			  int dpl, unsigned short selector)
{
	struct idt_entry *base =
		(struct idt_entry *)addr_gva2hva(vm, vm->arch.idt);
	struct idt_entry *e = &base[vector];

	memset(e, 0, sizeof(*e));
	e->offset0 = addr;
	e->selector = selector;
	e->ist = 0;
	e->type = 14;
	e->dpl = dpl;
	e->p = 1;
	e->offset1 = addr >> 16;
	e->offset2 = addr >> 32;
}

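/*
 * Handle an expected exception: the guest arms a fixup by placing
 * KVM_EXCEPTION_MAGIC in R9, the potentially faulting RIP in R10, and the
 * fixup target in R11. On a match, resume at the fixup target and return the
 * vector and error code in R9/R10.
 */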
static bool kvm_fixup_exception(struct ex_regs *regs)
{
	if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10)
		return false;

	if (regs->vector == DE_VECTOR)
		regs->vector = KVM_MAGIC_DE_VECTOR;

	regs->rip = regs->r11;
	regs->r9 = regs->vector;
	regs->r10 = regs->error_code;
	return true;
}

void route_exception(struct ex_regs *regs)
{
	typedef void(*handler)(struct ex_regs *);
	handler *handlers = (handler *)exception_handlers;

	if (handlers && handlers[regs->vector]) {
		handlers[regs->vector](regs);
		return;
	}

	if (kvm_fixup_exception(regs))
		return;

	GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'",
		   regs->vector, regs->rip);
}

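/*
 * Allocate and populate the GDT, IDT, TSS, and the per-VM exception handler
 * table. Every IDT vector is pointed at the corresponding idt_handlers stub,
 * which dispatches to route_exception() in the guest.
 */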
static void vm_init_descriptor_tables(struct kvm_vm *vm)
{
	extern void *idt_handlers;
	struct kvm_segment seg;
	int i;

	vm->arch.gdt = __vm_alloc_page(vm, MEM_REGION_DATA);
	vm->arch.idt = __vm_alloc_page(vm, MEM_REGION_DATA);
	vm->handlers = __vm_alloc_page(vm, MEM_REGION_DATA);
	vm->arch.tss = __vm_alloc_page(vm, MEM_REGION_DATA);

	/* Handlers have the same address in both address spaces. */
	for (i = 0; i < NUM_INTERRUPTS; i++)
		set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS);

	*(gva_t *)addr_gva2hva(vm, (gva_t)(&exception_handlers)) = vm->handlers;

	kvm_seg_set_kernel_code_64bit(&seg);
	kvm_seg_fill_gdt_64bit(vm, &seg);

	kvm_seg_set_kernel_data_64bit(&seg);
	kvm_seg_fill_gdt_64bit(vm, &seg);

	kvm_seg_set_tss_64bit(vm->arch.tss, &seg);
	kvm_seg_fill_gdt_64bit(vm, &seg);
}

void vm_install_exception_handler(struct kvm_vm *vm, int vector,
				  void (*handler)(struct ex_regs *))
{
	gva_t *handlers = (gva_t *)addr_gva2hva(vm, vm->handlers);

	handlers[vector] = (gva_t)handler;
}

void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
{
	struct ucall uc;

	if (get_ucall(vcpu, &uc) == UCALL_ABORT)
		REPORT_GUEST_ASSERT(uc);
}

void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
{
	int r;

	TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
		    "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");

	vm_create_irqchip(vm);
	vm_init_descriptor_tables(vm);

	sync_global_to_guest(vm, host_cpu_is_intel);
	sync_global_to_guest(vm, host_cpu_is_amd);
	sync_global_to_guest(vm, host_cpu_is_hygon);
	sync_global_to_guest(vm, host_cpu_is_amd_compatible);
	sync_global_to_guest(vm, is_forced_emulation_enabled);
	sync_global_to_guest(vm, pmu_errata_mask);

	if (is_sev_vm(vm)) {
		struct kvm_sev_init init = { 0 };

		vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
	}

	r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
	TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
	guest_tsc_khz = r;
	sync_global_to_guest(vm, guest_tsc_khz);
}

void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
{
	struct kvm_regs regs;

	vcpu_regs_get(vcpu, &regs);
	regs.rip = (unsigned long) guest_code;
	vcpu_regs_set(vcpu, &regs);
}

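/*
 * Create a fully initialized vCPU: allocate and align a guest stack, load the
 * KVM-supported CPUID, initialize sregs/XCRs, and set RSP, RFLAGS, and the
 * runnable MP state.
 */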
struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, u32 vcpu_id)
{
	struct kvm_mp_state mp_state;
	struct kvm_regs regs;
	gva_t stack_gva;
	struct kvm_vcpu *vcpu;

	stack_gva = __vm_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
			       DEFAULT_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA);

	stack_gva += DEFAULT_STACK_PGS * getpagesize();

	/*
	 * Align stack to match calling sequence requirements in section "The
	 * Stack Frame" of the System V ABI AMD64 Architecture Processor
	 * Supplement, which requires the value (%rsp + 8) to be a multiple of
	 * 16 when control is transferred to the function entry point.
	 *
	 * If this code is ever used to launch a vCPU with a 32-bit entry
	 * point, it may need to subtract 4 bytes instead of 8 bytes.
	 */
	TEST_ASSERT(IS_ALIGNED(stack_gva, PAGE_SIZE),
		    "__vm_alloc() did not provide a page-aligned address");
	stack_gva -= 8;

	vcpu = __vm_vcpu_add(vm, vcpu_id);
	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
	vcpu_init_sregs(vm, vcpu);
	vcpu_init_xcrs(vm, vcpu);

	/* Setup guest general purpose registers */
	vcpu_regs_get(vcpu, &regs);
	regs.rflags = regs.rflags | 0x2;
	regs.rsp = stack_gva;
	vcpu_regs_set(vcpu, &regs);

	/* Setup the MP state */
	mp_state.mp_state = 0;
	vcpu_mp_state_set(vcpu, &mp_state);

	/*
	 * Refresh CPUID after setting SREGS and XCR0, so that KVM's "runtime"
	 * updates to guest CPUID, e.g. for OSXSAVE and XSAVE state size, are
	 * reflected into selftests' vCPU CPUID cache, i.e. so that the cache
	 * is consistent with vCPU state.
	 */
	vcpu_get_cpuid(vcpu);
	return vcpu;
}

struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, u32 vcpu_id)
{
	struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);

	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());

	return vcpu;
}

void vcpu_arch_free(struct kvm_vcpu *vcpu)
{
	if (vcpu->cpuid)
		free(vcpu->cpuid);
}

/* Do not use kvm_supported_cpuid directly except for validity checks. */
static void *kvm_supported_cpuid;

const struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
{
	int kvm_fd;

	if (kvm_supported_cpuid)
		return kvm_supported_cpuid;

	kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
	kvm_fd = open_kvm_dev_path_or_exit();

	kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID,
		  (struct kvm_cpuid2 *)kvm_supported_cpuid);

	close(kvm_fd);
	return kvm_supported_cpuid;
}

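/*
 * Look up a CPUID leaf/index in @cpuid and return bits [hi, lo] of the
 * requested output register, or 0 if the leaf isn't present.
 */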
static u32 __kvm_cpu_has(const struct kvm_cpuid2 *cpuid,
			 u32 function, u32 index,
			 u8 reg, u8 lo, u8 hi)
{
	const struct kvm_cpuid_entry2 *entry;
	int i;

	for (i = 0; i < cpuid->nent; i++) {
		entry = &cpuid->entries[i];

		/*
		 * The output registers in kvm_cpuid_entry2 are in alphabetical
		 * order, but kvm_x86_cpu_feature matches that mess, so yay
		 * pointer shenanigans!
		 */
		if (entry->function == function && entry->index == index)
			return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo;
	}

	return 0;
}

bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
		   struct kvm_x86_cpu_feature feature)
{
	return __kvm_cpu_has(cpuid, feature.function, feature.index,
			     feature.reg, feature.bit, feature.bit);
}

u32 kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,
		       struct kvm_x86_cpu_property property)
{
	return __kvm_cpu_has(cpuid, property.function, property.index,
			     property.reg, property.lo_bit, property.hi_bit);
}

u64 kvm_get_feature_msr(u64 msr_index)
{
	struct {
		struct kvm_msrs header;
		struct kvm_msr_entry entry;
	} buffer = {};
	int r, kvm_fd;

	buffer.header.nmsrs = 1;
	buffer.entry.index = msr_index;
	kvm_fd = open_kvm_dev_path_or_exit();

	r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
	TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r));

	close(kvm_fd);
	return buffer.entry.data;
}

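/*
 * Request guest permission to use a dynamically enabled XSAVE feature via
 * ARCH_REQ_XCOMP_GUEST_PERM, skipping the test if KVM or the kernel doesn't
 * support the feature.
 */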
void __vm_xsave_require_permission(u64 xfeature, const char *name)
{
	int kvm_fd;
	u64 bitmask;
	long rc;
	struct kvm_device_attr attr = {
		.group = 0,
		.attr = KVM_X86_XCOMP_GUEST_SUPP,
		.addr = (unsigned long) &bitmask,
	};

	TEST_ASSERT(!kvm_supported_cpuid,
		    "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM");

	TEST_ASSERT(is_power_of_2(xfeature),
		    "Dynamic XFeatures must be enabled one at a time");

	kvm_fd = open_kvm_dev_path_or_exit();
	rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
	close(kvm_fd);

	if (rc == -1 && (errno == ENXIO || errno == EINVAL))
		__TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported");

	TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);

	__TEST_REQUIRE(bitmask & xfeature,
		       "Required XSAVE feature '%s' not supported", name);

	TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature)));

	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
	TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
	TEST_ASSERT(bitmask & xfeature,
		    "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx",
		    name, xfeature, bitmask);
}

void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid)
{
	TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID");

	/* Allow overriding the default CPUID. */
	if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) {
		free(vcpu->cpuid);
		vcpu->cpuid = NULL;
	}

	if (!vcpu->cpuid)
		vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent);

	memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
	vcpu_set_cpuid(vcpu);
}

void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
			     struct kvm_x86_cpu_property property,
			     u32 value)
{
	struct kvm_cpuid_entry2 *entry;

	entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index);

	(&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit);
	(&entry->eax)[property.reg] |= value << property.lo_bit;

	vcpu_set_cpuid(vcpu);

	/* Sanity check that @value doesn't exceed the bounds in any way. */
	TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value);
}

void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, u32 function)
{
	struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function);

	entry->eax = 0;
	entry->ebx = 0;
	entry->ecx = 0;
	entry->edx = 0;
	vcpu_set_cpuid(vcpu);
}

void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu,
				     struct kvm_x86_cpu_feature feature,
				     bool set)
{
	struct kvm_cpuid_entry2 *entry;
	u32 *reg;

	entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index);
	reg = (&entry->eax) + feature.reg;

	if (set)
		*reg |= BIT(feature.bit);
	else
		*reg &= ~BIT(feature.bit);

	vcpu_set_cpuid(vcpu);
}

u64 vcpu_get_msr(struct kvm_vcpu *vcpu, u64 msr_index)
{
	struct {
		struct kvm_msrs header;
		struct kvm_msr_entry entry;
	} buffer = {};

	buffer.header.nmsrs = 1;
	buffer.entry.index = msr_index;

	vcpu_msrs_get(vcpu, &buffer.header);

	return buffer.entry.data;
}

int _vcpu_set_msr(struct kvm_vcpu *vcpu, u64 msr_index, u64 msr_value)
{
	struct {
		struct kvm_msrs header;
		struct kvm_msr_entry entry;
	} buffer = {};

	memset(&buffer, 0, sizeof(buffer));
	buffer.header.nmsrs = 1;
	buffer.entry.index = msr_index;
	buffer.entry.data = msr_value;

	return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header);
}

void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
{
	va_list ap;
	struct kvm_regs regs;

	TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
		    " num: %u",
		    num);

	va_start(ap, num);
	vcpu_regs_get(vcpu, &regs);

	if (num >= 1)
		regs.rdi = va_arg(ap, u64);

	if (num >= 2)
		regs.rsi = va_arg(ap, u64);

	if (num >= 3)
		regs.rdx = va_arg(ap, u64);

	if (num >= 4)
		regs.rcx = va_arg(ap, u64);

	if (num >= 5)
		regs.r8 = va_arg(ap, u64);

	if (num >= 6)
		regs.r9 = va_arg(ap, u64);

	vcpu_regs_set(vcpu, &regs);
	va_end(ap);
}

void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, u8 indent)
{
	struct kvm_regs regs;
	struct kvm_sregs sregs;

	fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id);

	fprintf(stream, "%*sregs:\n", indent + 2, "");
	vcpu_regs_get(vcpu, &regs);
	regs_dump(stream, &regs, indent + 4);

	fprintf(stream, "%*ssregs:\n", indent + 2, "");
	vcpu_sregs_get(vcpu, &sregs);
	sregs_dump(stream, &sregs, indent + 4);
}

static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs)
{
	struct kvm_msr_list *list;
	struct kvm_msr_list nmsrs;
	int kvm_fd, r;

	kvm_fd = open_kvm_dev_path_or_exit();

	nmsrs.nmsrs = 0;
	if (!feature_msrs)
		r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
	else
		r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs);

	TEST_ASSERT(r == -1 && errno == E2BIG,
		    "Expected -E2BIG, got rc: %i errno: %i (%s)",
		    r, errno, strerror(errno));

	list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0]));
	TEST_ASSERT(list, "-ENOMEM when allocating MSR index list");
	list->nmsrs = nmsrs.nmsrs;

	if (!feature_msrs)
		kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
	else
		kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);
	close(kvm_fd);

	TEST_ASSERT(list->nmsrs == nmsrs.nmsrs,
		    "Number of MSRs in list changed, was %d, now %d",
		    nmsrs.nmsrs, list->nmsrs);
	return list;
}

const struct kvm_msr_list *kvm_get_msr_index_list(void)
{
	static const struct kvm_msr_list *list;

	if (!list)
		list = __kvm_get_msr_index_list(false);
	return list;
}

const struct kvm_msr_list *kvm_get_feature_msr_index_list(void)
{
	static const struct kvm_msr_list *list;

	if (!list)
		list = __kvm_get_msr_index_list(true);
	return list;
}

bool kvm_msr_is_in_save_restore_list(u32 msr_index)
{
	const struct kvm_msr_list *list = kvm_get_msr_index_list();
	int i;

	for (i = 0; i < list->nmsrs; ++i) {
		if (list->indices[i] == msr_index)
			return true;
	}

	return false;
}

static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu,
				  struct kvm_x86_state *state)
{
	int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2);

	if (size) {
		state->xsave = malloc(size);
		vcpu_xsave2_get(vcpu, state->xsave);
	} else {
		state->xsave = malloc(sizeof(struct kvm_xsave));
		vcpu_xsave_get(vcpu, state->xsave);
	}
}

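/*
 * Snapshot all vCPU state (events, MP state, registers, XSAVE, XCRs, sregs,
 * nested state, MSRs, and debug registers) so that it can later be restored
 * with vcpu_load_state(), e.g. for save/restore and migration tests.
 */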
struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu)
{
	const struct kvm_msr_list *msr_list = kvm_get_msr_index_list();
	struct kvm_x86_state *state;
	int i;

	static int nested_size = -1;

	if (nested_size == -1) {
		nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
		TEST_ASSERT(nested_size <= sizeof(state->nested_),
			    "Nested state size too big, %i > %zi",
			    nested_size, sizeof(state->nested_));
	}

	/*
	 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
	 * guest state is consistent only after userspace re-enters the
	 * kernel with KVM_RUN. Complete IO prior to migrating state
	 * to a new VM.
	 */
	vcpu_run_complete_io(vcpu);

	state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0]));
	TEST_ASSERT(state, "-ENOMEM when allocating kvm state");

	vcpu_events_get(vcpu, &state->events);
	vcpu_mp_state_get(vcpu, &state->mp_state);
	vcpu_regs_get(vcpu, &state->regs);
	vcpu_save_xsave_state(vcpu, state);

	if (kvm_has_cap(KVM_CAP_XCRS))
		vcpu_xcrs_get(vcpu, &state->xcrs);

	vcpu_sregs_get(vcpu, &state->sregs);

	if (nested_size) {
		state->nested.size = sizeof(state->nested_);

		vcpu_nested_state_get(vcpu, &state->nested);
		TEST_ASSERT(state->nested.size <= nested_size,
			    "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
			    state->nested.size, nested_size);
	} else {
		state->nested.size = 0;
	}

	state->msrs.nmsrs = msr_list->nmsrs;
	for (i = 0; i < msr_list->nmsrs; i++)
		state->msrs.entries[i].index = msr_list->indices[i];
	vcpu_msrs_get(vcpu, &state->msrs);

	vcpu_debugregs_get(vcpu, &state->debugregs);

	return state;
}

void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state)
{
	vcpu_sregs_set(vcpu, &state->sregs);
	vcpu_msrs_set(vcpu, &state->msrs);

	if (kvm_has_cap(KVM_CAP_XCRS))
		vcpu_xcrs_set(vcpu, &state->xcrs);

	vcpu_xsave_set(vcpu, state->xsave);
	vcpu_events_set(vcpu, &state->events);
	vcpu_mp_state_set(vcpu, &state->mp_state);
	vcpu_debugregs_set(vcpu, &state->debugregs);
	vcpu_regs_set(vcpu, &state->regs);

	if (state->nested.size)
		vcpu_nested_state_set(vcpu, &state->nested);
}

void kvm_x86_state_cleanup(struct kvm_x86_state *state)
{
	free(state->xsave);
	free(state);
}

void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
{
	if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {
		*pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32;
		*va_bits = 32;
	} else {
		*pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
		*va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR);
	}
}

void kvm_init_vm_address_properties(struct kvm_vm *vm)
{
	if (is_sev_vm(vm)) {
		vm->arch.sev_fd = open_sev_dev_path_or_exit();
		vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
		vm->gpa_tag_mask = vm->arch.c_bit;
	} else {
		vm->arch.sev_fd = -1;
	}
}

const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
					       u32 function, u32 index)
{
	int i;

	for (i = 0; i < cpuid->nent; i++) {
		if (cpuid->entries[i].function == function &&
		    cpuid->entries[i].index == index)
			return &cpuid->entries[i];
	}

	TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);

	return NULL;
}

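/*
 * Issue a hypercall using the vendor-appropriate instruction: VMMCALL on
 * AMD-compatible CPUs, VMCALL otherwise. The hypercall number and arguments
 * are passed in via the register constraints supplied by the caller.
 */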
#define X86_HYPERCALL(inputs...)					\
({									\
	u64 r;								\
									\
	asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t"		\
		     "jnz 1f\n\t"					\
		     "vmcall\n\t"					\
		     "jmp 2f\n\t"					\
		     "1: vmmcall\n\t"					\
		     "2:"						\
		     : "=a"(r)						\
		     : [use_vmmcall] "r" (host_cpu_is_amd_compatible),	\
		       inputs);						\
									\
	r;								\
})

u64 kvm_hypercall(u64 nr, u64 a0, u64 a1, u64 a2, u64 a3)
{
	return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
}

u64 __xen_hypercall(u64 nr, u64 a0, void *a1)
{
	return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1));
}

void xen_hypercall(u64 nr, u64 a0, void *a1)
{
	GUEST_ASSERT(!__xen_hypercall(nr, a0, a1));
}

unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
	const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
	unsigned long ht_gfn, max_gfn, max_pfn;
	u8 maxphyaddr, guest_maxphyaddr;

	/*
	 * Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR
	 * enumerates the max _mappable_ GPA, which can be less than the raw
	 * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU
	 * doesn't support 5-level TDP.
	 */
	guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);
	guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits;
	TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits,
		    "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR");

	max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1;

	/* Avoid reserved HyperTransport region on AMD or Hygon processors. */
	if (!host_cpu_is_amd_compatible)
		return max_gfn;

	/* On parts with <40 physical address bits, the area is fully hidden */
	if (vm->pa_bits < 40)
		return max_gfn;

	/* Before family 17h, the HyperTransport area is just below 1T. */
	ht_gfn = (1 << 28) - num_ht_pages;
	if (this_cpu_family() < 0x17)
		goto done;

	/*
	 * Otherwise it's at the top of the physical address space, possibly
	 * reduced due to SME or CSV by bits 11:6 of CPUID[0x8000001f].EBX. Use
	 * the old conservative value if MAXPHYADDR is not enumerated.
	 */
	if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))
		goto done;

	maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
	max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1;

	if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION))
		max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION);

	ht_gfn = max_pfn - num_ht_pages;
done:
	return min(max_gfn, ht_gfn - 1);
}

void kvm_selftest_arch_init(void)
{
	host_cpu_is_intel = this_cpu_is_intel();
	host_cpu_is_amd = this_cpu_is_amd();
	host_cpu_is_hygon = this_cpu_is_hygon();
	host_cpu_is_amd_compatible = host_cpu_is_amd || host_cpu_is_hygon;
	is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();

	kvm_init_pmu_errata();
}

bool sys_clocksource_is_based_on_tsc(void)
{
	char *clk_name = sys_get_cur_clocksource();
	bool ret = !strcmp(clk_name, "tsc\n") ||
		   !strcmp(clk_name, "hyperv_clocksource_tsc_page\n");

	free(clk_name);

	return ret;
}

bool kvm_arch_has_default_irqchip(void)
{
	return true;
}

void setup_smram(struct kvm_vm *vm, struct kvm_vcpu *vcpu, u64 smram_gpa,
		 const void *smi_handler, size_t handler_size)
{
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, smram_gpa,
				    SMRAM_MEMSLOT, SMRAM_PAGES, 0);
	TEST_ASSERT(vm_phy_pages_alloc(vm, SMRAM_PAGES, smram_gpa,
				       SMRAM_MEMSLOT) == smram_gpa,
		    "Could not allocate guest physical addresses for SMRAM");

	memset(addr_gpa2hva(vm, smram_gpa), 0x0, SMRAM_SIZE);
	memcpy(addr_gpa2hva(vm, smram_gpa) + 0x8000, smi_handler, handler_size);
	vcpu_set_msr(vcpu, MSR_IA32_SMBASE, smram_gpa);
}

void inject_smi(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_events events;

	vcpu_events_get(vcpu, &events);
	events.smi.pending = 1;
	events.flags |= KVM_VCPUEVENT_VALID_SMM;
	vcpu_events_set(vcpu, &events);
}