Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-s390-master-7.0-2' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD

KVM: s390: More memory management fixes

Lots of small and not-so-small fixes for the newly rewritten gmap,
mostly affecting the handling of nested guests.

+231 -178
+15 -85
arch/s390/kvm/dat.c
···
  }
  
  /**
- * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
- * @crstep: Pointer to the CRST entry
- * @new: Replacement entry.
- * @gfn: The affected guest address.
- * @asce: The ASCE of the address space.
- *
- * Context: This function is assumed to be called with kvm->mmu_lock held.
- */
- void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
- {
-         if (crstep->h.i) {
-                 WRITE_ONCE(*crstep, new);
-                 return;
-         } else if (cpu_has_edat2()) {
-                 crdte_crste(crstep, *crstep, new, gfn, asce);
-                 return;
-         }
- 
-         if (machine_has_tlb_guest())
-                 idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
-         else
-                 idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
-         WRITE_ONCE(*crstep, new);
- }
- 
- /**
   * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
   * @crstep: Pointer to the CRST entry.
   * @old: Expected old value.
···
   *
   * Return: %true if the exchange was successful.
   */
- bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
-                             union asce asce)
+ bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
+                                          gfn_t gfn, union asce asce)
  {
          if (old.h.i)
                  return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
···
                          pt->ptes[i].val = init.val | i * PAGE_SIZE;
                  /* No need to take locks as the page table is not installed yet. */
                  pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+                 pgste_init.vsie_notif = old.s.fc1.vsie_notif;
                  pgste_init.pcl = uses_skeys && init.h.i;
                  dat_init_pgstes(pt, pgste_init.val);
          } else {
···
  
          /* This table entry needs to be updated. */
          if (walk->start <= gfn && walk->end >= next) {
-                 dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
+                 if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
+                         return -EINVAL;
                  /* A lower level table was present, needs to be freed. */
                  if (!crste.h.fc && !crste.h.i) {
                          if (is_pmd(crste))
···
          return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
  }
  
- int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
-              bool uses_skeys, struct guest_fault *f)
- {
-         union crste oldval, newval;
-         union pte newpte, oldpte;
-         union pgste pgste;
-         int rc = 0;
- 
-         rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
-         if (rc == -EINVAL || rc == -ENOMEM)
-                 return rc;
-         if (rc)
-                 return -EAGAIN;
- 
-         if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
-                 return -EINVAL;
- 
-         if (f->ptep) {
-                 pgste = pgste_get_lock(f->ptep);
-                 oldpte = *f->ptep;
-                 newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
-                 newpte.s.sd = oldpte.s.sd;
-                 oldpte.s.sd = 0;
-                 if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
-                         pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
-                         if (f->callback)
-                                 f->callback(f);
-                 } else {
-                         rc = -EAGAIN;
-                 }
-                 pgste_set_unlock(f->ptep, pgste);
-         } else {
-                 oldval = READ_ONCE(*f->crstep);
-                 newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
-                                     f->write_attempt | oldval.s.fc1.d);
-                 newval.s.fc1.sd = oldval.s.fc1.sd;
-                 if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
-                     crste_origin_large(oldval) != crste_origin_large(newval))
-                         return -EAGAIN;
-                 if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
-                         return -EAGAIN;
-                 if (f->callback)
-                         f->callback(f);
-         }
- 
-         return rc;
- }
- 
  static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
  {
-         union crste crste = READ_ONCE(*crstep);
+         union crste newcrste, oldcrste;
          int *n = walk->priv;
  
-         if (!crste.h.fc || crste.h.i || crste.h.p)
-                 return 0;
- 
+         do {
+                 oldcrste = READ_ONCE(*crstep);
+                 if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
+                         return 0;
+                 if (oldcrste.s.fc1.prefix_notif)
+                         break;
+                 newcrste = oldcrste;
+                 newcrste.s.fc1.prefix_notif = 1;
+         } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
          *n = 2;
-         if (crste.s.fc1.prefix_notif)
-                 return 0;
-         crste.s.fc1.prefix_notif = 1;
-         dat_crstep_xchg(crstep, crste, gfn, walk->asce);
          return 0;
  }
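The dat.c hunks above retire the non-atomic dat_crstep_xchg() helper in favour of compare-and-swap retry loops built on dat_crstep_xchg_atomic(), and the same read, recompute, retry shape recurs in the files below. A minimal standalone sketch of that pattern follows; it uses C11 atomics and invented names (entry_xchg_atomic, entry_set_flag) rather than the kernel helpers, so it only illustrates the loop shape, not the real code.

#include <inttypes.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for dat_crstep_xchg_atomic(): install "new" only if
 * the slot still holds "old"; callers must act on the result. */
static bool entry_xchg_atomic(_Atomic uint64_t *slot, uint64_t old, uint64_t new)
{
        return atomic_compare_exchange_strong(slot, &old, new);
}

/* The retry-loop shape used by the fixes: re-read the entry, recompute the
 * replacement, and try again whenever a concurrent update wins the race. */
static void entry_set_flag(_Atomic uint64_t *slot, uint64_t flag)
{
        uint64_t old, new;

        do {
                old = atomic_load(slot);
                if (old & flag)         /* already set, nothing to do */
                        return;
                new = old | flag;
        } while (!entry_xchg_atomic(slot, old, new));
}

int main(void)
{
        _Atomic uint64_t entry = 0x1000;

        entry_set_flag(&entry, 0x1);
        printf("entry = 0x%" PRIx64 "\n", (uint64_t)atomic_load(&entry));
        return 0;
}
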
+12 -11
arch/s390/kvm/dat.h
···
          unsigned long :44; /* HW */
          unsigned long : 3; /* Unused */
          unsigned long : 1; /* HW */
+         unsigned long s : 1; /* Special */
          unsigned long w : 1; /* Writable soft-bit */
          unsigned long r : 1; /* Readable soft-bit */
          unsigned long d : 1; /* Dirty */
          unsigned long y : 1; /* Young */
-         unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
          unsigned long : 3; /* HW */
+         unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
          unsigned long vsie_notif : 1; /* Referenced in a shadow table */
-         unsigned long : 1; /* Unused */
          unsigned long : 4; /* HW */
          unsigned long sd : 1; /* Soft-Dirty */
          unsigned long pr : 1; /* Present */
···
          unsigned long :33; /* HW */
          unsigned long :14; /* Unused */
          unsigned long : 1; /* HW */
+         unsigned long s : 1; /* Special */
          unsigned long w : 1; /* Writable soft-bit */
          unsigned long r : 1; /* Readable soft-bit */
          unsigned long d : 1; /* Dirty */
          unsigned long y : 1; /* Young */
-         unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
          unsigned long : 3; /* HW */
+         unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
          unsigned long vsie_notif : 1; /* Referenced in a shadow table */
-         unsigned long : 1; /* Unused */
          unsigned long : 4; /* HW */
          unsigned long sd : 1; /* Soft-Dirty */
          unsigned long pr : 1; /* Present */
···
  struct {
          unsigned long :47;
          unsigned long : 1; /* HW (should be 0) */
+         unsigned long s : 1; /* Special */
          unsigned long w : 1; /* Writable */
          unsigned long r : 1; /* Readable */
          unsigned long d : 1; /* Dirty */
          unsigned long y : 1; /* Young */
-         unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
          unsigned long : 3; /* HW */
+         unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
          unsigned long vsie_notif : 1; /* Referenced in a shadow table */
-         unsigned long : 1;
          unsigned long : 4; /* HW */
          unsigned long sd : 1; /* Soft-Dirty */
          unsigned long pr : 1; /* Present */
···
               u16 type, u16 param);
  int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
  bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
- int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
-              bool uses_skeys, struct guest_fault *f);
  
  int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
  long dat_reset_cmma(union asce asce, gfn_t start_gfn);
···
          return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
  }
  
- static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
+ static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
  {
-         union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
+         union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);
  
-         dat_crstep_xchg(crstep, newcrste, gfn, asce);
+         do {
+                 oldcrste = READ_ONCE(*crstep);
+         } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
+         return oldcrste;
  }
  
  static inline int get_level(union crste *crstep, union pte *ptep)
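The dat.h hunks insert a new 's' (Special) soft bit and move prefix_notif next to vsie_notif among the software-defined bits of each entry. The sketch below is a rough illustration of how such soft bits can sit alongside hardware-defined fields in a 64-bit union; the field names follow the patch, but the widths, positions and overall layout are simplified and compiler-dependent, not the real s390 DAT entry format.

#include <stdio.h>

/* Simplified illustration only: not the real s390 table-entry layout. */
union sample_entry {
        unsigned long long val;
        struct {
                unsigned long long hw_high      : 48; /* hardware-defined bits */
                unsigned long long s            : 1;  /* Special */
                unsigned long long w            : 1;  /* Writable soft-bit */
                unsigned long long r            : 1;  /* Readable soft-bit */
                unsigned long long d            : 1;  /* Dirty */
                unsigned long long y            : 1;  /* Young */
                unsigned long long hw_mid       : 3;  /* hardware-defined bits */
                unsigned long long prefix_notif : 1;  /* prefix invalidation notification */
                unsigned long long vsie_notif   : 1;  /* referenced in a shadow table */
                unsigned long long hw_low       : 4;  /* hardware-defined bits */
                unsigned long long sd           : 1;  /* Soft-Dirty */
                unsigned long long pr           : 1;  /* Present */
        };
};

int main(void)
{
        union sample_entry e = { .val = 0 };

        e.prefix_notif = 1;
        e.pr = 1;
        printf("raw value: %#llx\n", e.val);
        return 0;
}
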
+51 -20
arch/s390/kvm/gaccess.c
···
  
          if (!pgste_get_trylock(ptep_h, &pgste))
                  return -EAGAIN;
-         newpte = _pte(f->pfn, f->writable, !p, 0);
-         newpte.s.d |= ptep->s.d;
-         newpte.s.sd |= ptep->s.sd;
-         newpte.h.p &= ptep->h.p;
-         pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
-         pgste.vsie_notif = 1;
+         newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s);
+         newpte.s.d |= ptep_h->s.d;
+         newpte.s.sd |= ptep_h->s.sd;
+         newpte.h.p &= ptep_h->h.p;
+         if (!newpte.h.p && !f->writable) {
+                 rc = -EOPNOTSUPP;
+         } else {
+                 pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
+                 pgste.vsie_notif = 1;
+         }
          pgste_set_unlock(ptep_h, pgste);
+         if (rc)
+                 return rc;
+         if (!sg->parent)
+                 return -EAGAIN;
  
          newpte = _pte(f->pfn, 0, !p, 0);
          if (!pgste_get_trylock(ptep, &pgste))
···
  static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
                              struct guest_fault *f, bool p)
  {
-         union crste newcrste;
+         union crste newcrste, oldcrste;
          gfn_t gfn;
          int rc;
  
···
          if (rc)
                  return rc;
  
-         newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
-         newcrste.s.fc1.d |= host->s.fc1.d;
-         newcrste.s.fc1.sd |= host->s.fc1.sd;
-         newcrste.h.p &= host->h.p;
-         newcrste.s.fc1.vsie_notif = 1;
-         newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
-         _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
+         do {
+                 /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */
+                 if (!sg->parent)
+                         return -EAGAIN;
+                 oldcrste = READ_ONCE(*host);
+                 newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
+                 newcrste.s.fc1.d |= oldcrste.s.fc1.d;
+                 newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
+                 newcrste.h.p &= oldcrste.h.p;
+                 newcrste.s.fc1.vsie_notif = 1;
+                 newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
+                 newcrste.s.fc1.s = oldcrste.s.fc1.s;
+                 if (!newcrste.h.p && !f->writable)
+                         return -EOPNOTSUPP;
+         } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
+         if (!sg->parent)
+                 return -EAGAIN;
  
-         newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
-         dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+         newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
+         gfn = gpa_to_gfn(raddr);
+         while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
+                 ;
          return 0;
  }
  
···
          if (rc)
                  return rc;
  
-         /* A race occourred. The shadow mapping is already valid, nothing to do */
-         if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+         /* A race occurred. The shadow mapping is already valid, nothing to do */
+         if ((ptep && !ptep->h.i && ptep->h.p == w->p) ||
+             (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p))
                  return 0;
  
          gl = get_level(table, ptep);
+ 
+         /* In case of a real address space */
+         if (w->level <= LEVEL_MEM) {
+                 l = TABLE_TYPE_PAGE_TABLE;
+                 hl = TABLE_TYPE_REGION1;
+                 goto real_address_space;
+         }
  
          /*
           * Skip levels that are already protected. For each level, protect
           * only the page containing the entry, not the whole table.
           */
          for (i = gl ; i >= w->level; i--) {
-                 rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
-                                        entries[i - 1].pfn, i, entries[i - 1].writable);
+                 rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr),
+                                        entries[i].pfn, i + 1, entries[i].writable);
                  if (rc)
                          return rc;
+                 if (!sg->parent)
+                         return -EAGAIN;
          }
  
          rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
···
          /* Get the smallest granularity */
          l = min3(gl, hl, w->level);
  
+ real_address_space:
          flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
          /* If necessary, create the shadow mapping */
          if (l < gl) {
+114 -46
arch/s390/kvm/gmap.c
···
          struct clear_young_pte_priv *priv = walk->priv;
          union crste crste, new;
  
-         crste = READ_ONCE(*crstep);
+         do {
+                 crste = READ_ONCE(*crstep);
  
-         if (!crste.h.fc)
-                 return 0;
-         if (!crste.s.fc1.y && crste.h.i)
-                 return 0;
-         if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
+                 if (!crste.h.fc)
+                         return 0;
+                 if (!crste.s.fc1.y && crste.h.i)
+                         return 0;
+                 if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
+                         break;
+ 
                  new = crste;
                  new.h.i = 1;
                  new.s.fc1.y = 0;
···
                          folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
                  new.s.fc1.d = 0;
                  new.h.p = 1;
-                 dat_crstep_xchg(crstep, new, gfn, walk->asce);
-         }
+         } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
+ 
          priv->young = 1;
          return 0;
  }
···
  {
          struct gmap_unmap_priv *priv = walk->priv;
          struct folio *folio = NULL;
+         union crste old = *crstep;
  
-         if (crstep->h.fc) {
-                 if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
-                         folio = phys_to_folio(crste_origin_large(*crstep));
-                 gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
-                 if (folio)
-                         uv_convert_from_secure_folio(folio);
-         }
+         if (!old.h.fc)
+                 return 0;
+ 
+         if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+                 folio = phys_to_folio(crste_origin_large(old));
+         /* No races should happen because kvm->mmu_lock is held in write mode */
+         KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
+                    priv->gmap->kvm);
+         if (folio)
+                 uv_convert_from_secure_folio(folio);
  
          return 0;
  }
···
  
          if (fatal_signal_pending(current))
                  return 1;
-         crste = READ_ONCE(*table);
-         if (!crste.h.fc)
-                 return 0;
-         if (crste.h.p && !crste.s.fc1.sd)
-                 return 0;
+         do {
+                 crste = READ_ONCE(*table);
+                 if (!crste.h.fc)
+                         return 0;
+                 if (crste.h.p && !crste.s.fc1.sd)
+                         return 0;
  
-         /*
-          * If this large page contains one or more prefixes of vCPUs that are
-          * currently running, do not reset the protection, leave it marked as
-          * dirty.
-          */
-         if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
+                 /*
+                  * If this large page contains one or more prefixes of vCPUs that are
+                  * currently running, do not reset the protection, leave it marked as
+                  * dirty.
+                  */
+                 if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
+                         break;
                  new = crste;
                  new.h.p = 1;
                  new.s.fc1.sd = 0;
-                 gmap_crstep_xchg(gmap, table, new, gfn);
-         }
+         } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
  
          for ( ; gfn < end; gfn++)
                  mark_page_dirty(gmap->kvm, gfn);
···
          _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap);
  }
  
- static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
+ static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
  {
          union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
  
···
                          newcrste.s.fc1.d = 1;
                          newcrste.s.fc1.sd = 1;
                  }
-                 if (!oldcrste.s.fc1.d && newcrste.s.fc1.d)
-                         SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
                  /* In case of races, let the slow path deal with it. */
-                 return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
+                 return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
          }
          /* Trying to write on a read-only page, let the slow path deal with it. */
          return 1;
···
                  newpte.s.d = 1;
                  newpte.s.sd = 1;
          }
-         if (!oldpte.s.d && newpte.s.d)
-                 SetPageDirty(pfn_to_page(newpte.h.pfra));
          *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
  
          return 0;
···
                  fault->callback(fault);
                  pgste_set_unlock(fault->ptep, pgste);
          } else {
-                 rc = gmap_handle_minor_crste_fault(gmap->asce, fault);
+                 rc = gmap_handle_minor_crste_fault(gmap, fault);
                  if (!rc && fault->callback)
                          fault->callback(fault);
          }
···
          return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
  }
  
+ static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
+                       struct guest_fault *f)
+ {
+         union crste oldval, newval;
+         union pte newpte, oldpte;
+         union pgste pgste;
+         int rc = 0;
+ 
+         rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
+                             &f->crstep, &f->ptep);
+         if (rc == -ENOMEM)
+                 return rc;
+         if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
+                 return rc;
+         if (rc)
+                 return -EAGAIN;
+         if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
+                 return -EINVAL;
+ 
+         if (f->ptep) {
+                 pgste = pgste_get_lock(f->ptep);
+                 oldpte = *f->ptep;
+                 newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+                 newpte.s.sd = oldpte.s.sd;
+                 oldpte.s.sd = 0;
+                 if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+                         pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
+                         if (f->callback)
+                                 f->callback(f);
+                 } else {
+                         rc = -EAGAIN;
+                 }
+                 pgste_set_unlock(f->ptep, pgste);
+         } else {
+                 do {
+                         oldval = READ_ONCE(*f->crstep);
+                         newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+                                             f->write_attempt | oldval.s.fc1.d);
+                         newval.s.fc1.s = !f->page;
+                         newval.s.fc1.sd = oldval.s.fc1.sd;
+                         if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+                             crste_origin_large(oldval) != crste_origin_large(newval))
+                                 return -EAGAIN;
+                 } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
+                 if (f->callback)
+                         f->callback(f);
+         }
+ 
+         return rc;
+ }
+ 
  int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
  {
          unsigned int order;
-         int rc, level;
+         int level;
  
          lockdep_assert_held(&gmap->kvm->mmu_lock);
  
···
                  else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
                          level = TABLE_TYPE_SEGMENT;
          }
-         rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
-         KVM_BUG_ON(rc == -EINVAL, gmap->kvm);
-         return rc;
+         return _gmap_link(mc, gmap, level, f);
  }
  
  static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
                               gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
  {
+         union crste newcrste, oldcrste;
          struct page_table *pt;
-         union crste newcrste;
          union crste *crstep;
          union pte *ptep;
          int rc;
···
                                &crstep, &ptep);
          if (rc)
                  return rc;
-         dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
+         do {
+                 oldcrste = READ_ONCE(*crstep);
+                 if (oldcrste.val == newcrste.val)
+                         break;
+         } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
          return 0;
  }
···
          int rc;
  
          rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
-         if (!rc)
-                 dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
+         if (rc)
+                 return;
+         while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
+                 ;
  }
  
  void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
···
                  dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
                  return;
          }
-         crste = READ_ONCE(*crstep);
-         dat_crstep_clear(crstep, r_gfn, sg->asce);
+ 
+         crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
          if (crste_leaf(crste) || crste.h.i)
                  return;
          if (is_pmd(crste))
···
  static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
                                                  struct gmap_protect_asce_top_level *context)
  {
+         struct gmap *parent;
          int rc, i;
  
          guard(write_lock)(&sg->kvm->mmu_lock);
···
          if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
                  return -EAGAIN;
  
-         scoped_guard(spinlock, &sg->parent->children_lock) {
+         parent = READ_ONCE(sg->parent);
+         if (!parent)
+                 return -EAGAIN;
+         scoped_guard(spinlock, &parent->children_lock) {
+                 if (READ_ONCE(sg->parent) != parent)
+                         return -EAGAIN;
                  for (i = 0; i < CRST_TABLE_PAGES; i++) {
                          if (!context->f[i].valid)
                                  continue;
···
  {
          struct gmap *sg, *new;
          int rc;
+ 
+         if (WARN_ON(!parent))
+                 return ERR_PTR(-EINVAL);
  
          scoped_guard(spinlock, &parent->children_lock) {
                  sg = gmap_find_shadow(parent, asce, edat_level);
+21 -12
arch/s390/kvm/gmap.h
···
                  else
                          _gmap_handle_vsie_unshadow_event(gmap, gfn);
          }
+         if (!ptep->s.d && newpte.s.d && !newpte.s.s)
+                 SetPageDirty(pfn_to_page(newpte.h.pfra));
          return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
  }
···
          return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
  }
  
- static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
-                                      gfn_t gfn, bool needs_lock)
+ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+                                                          union crste oldcrste, union crste newcrste,
+                                                          gfn_t gfn, bool needs_lock)
  {
-         unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);
+         unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;
+ 
+         if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
+                 return true;
  
          lockdep_assert_held(&gmap->kvm->mmu_lock);
          if (!needs_lock)
                  lockdep_assert_held(&gmap->children_lock);
  
          gfn = ALIGN_DOWN(gfn, align);
-         if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
-                 ne.s.fc1.prefix_notif = 0;
+         if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
+                 newcrste.s.fc1.prefix_notif = 0;
                  gmap_unmap_prefix(gmap, gfn, gfn + align);
          }
-         if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
-             (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
-                 ne.s.fc1.vsie_notif = 0;
+         if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
+             (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
+                 newcrste.s.fc1.vsie_notif = 0;
                  if (needs_lock)
                          gmap_handle_vsie_unshadow_event(gmap, gfn);
                  else
                          _gmap_handle_vsie_unshadow_event(gmap, gfn);
          }
-         dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
+         if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s)
+                 SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
+         return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
  }
  
- static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
-                                     gfn_t gfn)
+ static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+                                                         union crste oldcrste, union crste newcrste,
+                                                         gfn_t gfn)
  {
-         return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
+         return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
  }
  
  /**
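Both exchange helpers in gmap.h are now __must_check, so a caller can no longer silently ignore a lost race. A tiny standalone analogue of that annotation is sketched below; try_update() is an invented example, and __must_check expands to the same warn_unused_result attribute the kernel macro uses.

#include <stdbool.h>
#include <stdio.h>

#define __must_check __attribute__((warn_unused_result))

/* Invented example: update a slot only if it still holds the expected value. */
static bool __must_check try_update(int *slot, int expected, int new_value)
{
        if (*slot != expected)
                return false;
        *slot = new_value;
        return true;
}

int main(void)
{
        int v = 1;

        /* Ignoring the return value would trigger a compiler warning, so the
         * caller has to decide what to do when the update loses a race. */
        if (!try_update(&v, 1, 2))
                fprintf(stderr, "lost the race, retry or bail out\n");
        printf("v = %d\n", v);
        return 0;
}
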
+15 -3
arch/s390/kvm/kvm-s390.c
···
          }
  #endif
          case KVM_S390_VCPU_FAULT: {
-                 idx = srcu_read_lock(&vcpu->kvm->srcu);
-                 r = vcpu_dat_fault_handler(vcpu, arg, 0);
-                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
+                 gpa_t gaddr = arg;
+ 
+                 scoped_guard(srcu, &vcpu->kvm->srcu) {
+                         r = vcpu_ucontrol_translate(vcpu, &gaddr);
+                         if (r)
+                                 break;
+ 
+                         r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false);
+                         if (r == PGM_ADDRESSING)
+                                 r = -EFAULT;
+                         if (r <= 0)
+                                 break;
+                         r = -EIO;
+                         KVM_BUG_ON(r, vcpu->kvm);
+                 }
                  break;
          }
          case KVM_ENABLE_CAP:
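The KVM_S390_VCPU_FAULT handler drops the manual srcu_read_lock()/srcu_read_unlock() pair in favour of scoped_guard(), which releases the lock on every exit path of the block. Below is a loose userspace analogue of that scope-based cleanup, built on the GCC/Clang cleanup attribute; free_str() and the string resource are invented, and this is not the kernel's guard implementation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Cleanup handler: runs automatically when the annotated variable leaves scope. */
static void free_str(char **p)
{
        free(*p);
        puts("resource released");
}

int main(void)
{
        {
                /* Acquired here, released at the closing brace on any exit path. */
                __attribute__((cleanup(free_str))) char *s = strdup("scoped resource");

                printf("using: %s\n", s);
        }
        puts("after the scope");
        return 0;
}
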
+3 -1
arch/s390/kvm/vsie.c
···
  static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
  {
          struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-         struct gmap *sg;
+         struct gmap *sg = NULL;
          int rc = 0;
  
          while (1) {
···
                  sg = gmap_put(sg);
                  cond_resched();
          }
+         if (sg)
+                 sg = gmap_put(sg);
  
          if (rc == -EFAULT) {
                  /*
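The vsie.c fix initializes sg to NULL and drops whatever reference is still held once the loop exits, so an early break can no longer leak it. The sketch below shows that pattern with an invented refcounting pair (obj_get()/obj_put()) standing in for the gmap reference helpers.

#include <stdio.h>
#include <stdlib.h>

struct obj { int refs; };

static struct obj *obj_get(struct obj *o) { o->refs++; return o; }

/* Drop one reference; returns NULL so callers can write "o = obj_put(o);". */
static struct obj *obj_put(struct obj *o)
{
        if (o && --o->refs == 0) {
                puts("freeing object");
                free(o);
        }
        return NULL;
}

int main(void)
{
        struct obj *pool = calloc(1, sizeof(*pool));
        struct obj *sg = NULL;
        int i;

        if (!pool)
                return 1;
        pool->refs = 1;
        for (i = 0; i < 4; i++) {
                sg = obj_get(pool);
                if (i == 2)
                        break;          /* early exit with a reference still held */
                sg = obj_put(sg);
        }
        if (sg)                         /* the put added after the loop by the fix */
                sg = obj_put(sg);
        obj_put(pool);                  /* drop the initial reference */
        return 0;
}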