Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

KVM: s390: Remove non-atomic dat_crstep_xchg()

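In practice, dat_crstep_xchg() is racy and hard to use correctly: it
replaces the entry without verifying that it still holds the value the
caller based its decision on, so a concurrent update can be lost. Remove
it and replace its uses with dat_crstep_xchg_atomic(), which is now
__must_check so that every caller has to handle a failed exchange.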

This fixes real races that lead to system hangs and crashes.

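Opportunistically fix an alignment issue in _gmap_crstep_xchg_atomic()
(formerly _gmap_crstep_xchg()): the old code computed what appears to be
a bit count (8 or 19) but passed it to ALIGN_DOWN() and added it to the
gfn as if it were a number of guest frames. Use the number of frames
covered by the entry instead.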

Fixes: 589071eaaa8f ("KVM: s390: KVM page table management functions: clear and replace")
Fixes: 94fd9b16cc67 ("KVM: s390: KVM page table management functions: lifecycle management")
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>

+99 -96
+14 -37
arch/s390/kvm/dat.c
··· 135 135 } 136 136 137 137 /** 138 - * dat_crstep_xchg() - Exchange a gmap CRSTE with another. 139 - * @crstep: Pointer to the CRST entry 140 - * @new: Replacement entry. 141 - * @gfn: The affected guest address. 142 - * @asce: The ASCE of the address space. 143 - * 144 - * Context: This function is assumed to be called with kvm->mmu_lock held. 145 - */ 146 - void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce) 147 - { 148 - if (crstep->h.i) { 149 - WRITE_ONCE(*crstep, new); 150 - return; 151 - } else if (cpu_has_edat2()) { 152 - crdte_crste(crstep, *crstep, new, gfn, asce); 153 - return; 154 - } 155 - 156 - if (machine_has_tlb_guest()) 157 - idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL); 158 - else 159 - idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL); 160 - WRITE_ONCE(*crstep, new); 161 - } 162 - 163 - /** 164 138 * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another. 165 139 * @crstep: Pointer to the CRST entry. 166 140 * @old: Expected old value. ··· 149 175 * 150 176 * Return: %true if the exchange was successful. 151 177 */ 152 - bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, 153 - union asce asce) 178 + bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, 179 + gfn_t gfn, union asce asce) 154 180 { 155 181 if (old.h.i) 156 182 return arch_try_cmpxchg((long *)crstep, &old.val, new.val); ··· 868 894 869 895 /* This table entry needs to be updated. */ 870 896 if (walk->start <= gfn && walk->end >= next) { 871 - dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce); 897 + if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce)) 898 + return -EINVAL; 872 899 /* A lower level table was present, needs to be freed. 
*/ 873 900 if (!crste.h.fc && !crste.h.i) { 874 901 if (is_pmd(crste)) ··· 1047 1072 1048 1073 static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) 1049 1074 { 1050 - union crste crste = READ_ONCE(*crstep); 1075 + union crste newcrste, oldcrste; 1051 1076 int *n = walk->priv; 1052 1077 1053 - if (!crste.h.fc || crste.h.i || crste.h.p) 1054 - return 0; 1055 - 1078 + do { 1079 + oldcrste = READ_ONCE(*crstep); 1080 + if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p) 1081 + return 0; 1082 + if (oldcrste.s.fc1.prefix_notif) 1083 + break; 1084 + newcrste = oldcrste; 1085 + newcrste.s.fc1.prefix_notif = 1; 1086 + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce)); 1056 1087 *n = 2; 1057 - if (crste.s.fc1.prefix_notif) 1058 - return 0; 1059 - crste.s.fc1.prefix_notif = 1; 1060 - dat_crstep_xchg(crstep, crste, gfn, walk->asce); 1061 1088 return 0; 1062 1089 } 1063 1090
+6 -3
arch/s390/kvm/dat.h
··· 938 938 return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); 939 939 } 940 940 941 - static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce) 941 + static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce) 942 942 { 943 - union crste newcrste = _CRSTE_EMPTY(crstep->h.tt); 943 + union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt); 944 944 945 - dat_crstep_xchg(crstep, newcrste, gfn, asce); 945 + do { 946 + oldcrste = READ_ONCE(*crstep); 947 + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce)); 948 + return oldcrste; 946 949 } 947 950 948 951 static inline int get_level(union crste *crstep, union pte *ptep)
+14 -10
arch/s390/kvm/gaccess.c
··· 1456 1456 static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table, 1457 1457 struct guest_fault *f, bool p) 1458 1458 { 1459 - union crste newcrste; 1459 + union crste newcrste, oldcrste; 1460 1460 gfn_t gfn; 1461 1461 int rc; 1462 1462 ··· 1469 1469 if (rc) 1470 1470 return rc; 1471 1471 1472 - newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p); 1473 - newcrste.s.fc1.d |= host->s.fc1.d; 1474 - newcrste.s.fc1.sd |= host->s.fc1.sd; 1475 - newcrste.h.p &= host->h.p; 1476 - newcrste.s.fc1.vsie_notif = 1; 1477 - newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif; 1478 - _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false); 1472 + do { 1473 + oldcrste = READ_ONCE(*host); 1474 + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p); 1475 + newcrste.s.fc1.d |= oldcrste.s.fc1.d; 1476 + newcrste.s.fc1.sd |= oldcrste.s.fc1.sd; 1477 + newcrste.h.p &= oldcrste.h.p; 1478 + newcrste.s.fc1.vsie_notif = 1; 1479 + newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif; 1480 + } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false)); 1479 1481 1480 - newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p); 1481 - dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce); 1482 + newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); 1483 + gfn = gpa_to_gfn(raddr); 1484 + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) 1485 + ; 1482 1486 return 0; 1483 1487 } 1484 1488
+48 -34
arch/s390/kvm/gmap.c
··· 313 313 struct clear_young_pte_priv *priv = walk->priv; 314 314 union crste crste, new; 315 315 316 - crste = READ_ONCE(*crstep); 316 + do { 317 + crste = READ_ONCE(*crstep); 317 318 318 - if (!crste.h.fc) 319 - return 0; 320 - if (!crste.s.fc1.y && crste.h.i) 321 - return 0; 322 - if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) { 319 + if (!crste.h.fc) 320 + return 0; 321 + if (!crste.s.fc1.y && crste.h.i) 322 + return 0; 323 + if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end)) 324 + break; 325 + 323 326 new = crste; 324 327 new.h.i = 1; 325 328 new.s.fc1.y = 0; ··· 331 328 folio_set_dirty(phys_to_folio(crste_origin_large(crste))); 332 329 new.s.fc1.d = 0; 333 330 new.h.p = 1; 334 - dat_crstep_xchg(crstep, new, gfn, walk->asce); 335 - } 331 + } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce)); 332 + 336 333 priv->young = 1; 337 334 return 0; 338 335 } ··· 394 391 { 395 392 struct gmap_unmap_priv *priv = walk->priv; 396 393 struct folio *folio = NULL; 394 + union crste old = *crstep; 397 395 398 - if (crstep->h.fc) { 399 - if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 400 - folio = phys_to_folio(crste_origin_large(*crstep)); 401 - gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn); 402 - if (folio) 403 - uv_convert_from_secure_folio(folio); 404 - } 396 + if (!old.h.fc) 397 + return 0; 398 + 399 + if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) 400 + folio = phys_to_folio(crste_origin_large(old)); 401 + /* No races should happen because kvm->mmu_lock is held in write mode */ 402 + KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), 403 + priv->gmap->kvm); 404 + if (folio) 405 + uv_convert_from_secure_folio(folio); 405 406 406 407 return 0; 407 408 } ··· 481 474 482 475 if (fatal_signal_pending(current)) 483 476 return 1; 484 - crste = READ_ONCE(*table); 485 - if (!crste.h.fc) 486 - 
return 0; 487 - if (crste.h.p && !crste.s.fc1.sd) 488 - return 0; 477 + do { 478 + crste = READ_ONCE(*table); 479 + if (!crste.h.fc) 480 + return 0; 481 + if (crste.h.p && !crste.s.fc1.sd) 482 + return 0; 489 483 490 - /* 491 - * If this large page contains one or more prefixes of vCPUs that are 492 - * currently running, do not reset the protection, leave it marked as 493 - * dirty. 494 - */ 495 - if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) { 484 + /* 485 + * If this large page contains one or more prefixes of vCPUs that are 486 + * currently running, do not reset the protection, leave it marked as 487 + * dirty. 488 + */ 489 + if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end)) 490 + break; 496 491 new = crste; 497 492 new.h.p = 1; 498 493 new.s.fc1.sd = 0; 499 - gmap_crstep_xchg(gmap, table, new, gfn); 500 - } 494 + } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn)); 501 495 502 496 for ( ; gfn < end; gfn++) 503 497 mark_page_dirty(gmap->kvm, gfn); ··· 654 646 static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, 655 647 gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) 656 648 { 649 + union crste newcrste, oldcrste; 657 650 struct page_table *pt; 658 - union crste newcrste; 659 651 union crste *crstep; 660 652 union pte *ptep; 661 653 int rc; ··· 681 673 &crstep, &ptep); 682 674 if (rc) 683 675 return rc; 684 - dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce); 676 + do { 677 + oldcrste = READ_ONCE(*crstep); 678 + if (oldcrste.val == newcrste.val) 679 + break; 680 + } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce)); 685 681 return 0; 686 682 } 687 683 ··· 789 777 int rc; 790 778 791 779 rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); 792 - if (!rc) 793 - dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce); 780 + if (rc) 781 + return; 782 + while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, 
gmap->asce)) 783 + ; 794 784 } 795 785 796 786 void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) ··· 1031 1017 dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); 1032 1018 return; 1033 1019 } 1034 - crste = READ_ONCE(*crstep); 1035 - dat_crstep_clear(crstep, r_gfn, sg->asce); 1020 + 1021 + crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce); 1036 1022 if (crste_leaf(crste) || crste.h.i) 1037 1023 return; 1038 1024 if (is_pmd(crste))
+17 -12
arch/s390/kvm/gmap.h
··· 194 194 return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); 195 195 } 196 196 197 - static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, 198 - gfn_t gfn, bool needs_lock) 197 + static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep, 198 + union crste oldcrste, union crste newcrste, 199 + gfn_t gfn, bool needs_lock) 199 200 { 200 - unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11); 201 + unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES; 202 + 203 + if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm)) 204 + return true; 201 205 202 206 lockdep_assert_held(&gmap->kvm->mmu_lock); 203 207 if (!needs_lock) 204 208 lockdep_assert_held(&gmap->children_lock); 205 209 206 210 gfn = ALIGN_DOWN(gfn, align); 207 - if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) { 208 - ne.s.fc1.prefix_notif = 0; 211 + if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) { 212 + newcrste.s.fc1.prefix_notif = 0; 209 213 gmap_unmap_prefix(gmap, gfn, gfn + align); 210 214 } 211 - if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif && 212 - (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) { 213 - ne.s.fc1.vsie_notif = 0; 215 + if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif && 216 + (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) { 217 + newcrste.s.fc1.vsie_notif = 0; 214 218 if (needs_lock) 215 219 gmap_handle_vsie_unshadow_event(gmap, gfn); 216 220 else 217 221 _gmap_handle_vsie_unshadow_event(gmap, gfn); 218 222 } 219 - dat_crstep_xchg(crstep, ne, gfn, gmap->asce); 223 + return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); 220 224 } 221 225 222 - static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, 223 - gfn_t gfn) 226 + static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap 
*gmap, union crste *crstep, 227 + union crste oldcrste, union crste newcrste, 228 + gfn_t gfn) 224 229 { 225 - return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true); 230 + return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true); 226 231 } 227 232 228 233 /**