Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull nvdimm fixes from Dan Williams:
"A small crop of lockdep, sleeping while atomic, and other fixes /
band-aids in advance of the full-blown reworks targeting the next
merge window. The largest change here is "libnvdimm: fix blk free
space accounting", which deletes a pile of buggy code that better
testing would have caught before merging. The next change that is
borderline too big for a late rc is switching the device-dax locking
from rcu to srcu; I couldn't think of a smaller way to make that fix.

The __copy_user_nocache fix will have a full replacement in 4.12 to
move those pmem special-case considerations into the pmem driver. The
"libnvdimm: band aid btt vs clear poison locking" commit admits that
our error clearing support for btt went in broken, so we just disable
it in 4.11 and -stable. A replacement / full fix is in the pipeline
for 4.12.

Some of these would have been caught earlier had DEBUG_ATOMIC_SLEEP
been enabled on my development station. I wonder if we should have:

config DEBUG_ATOMIC_SLEEP
	default PROVE_LOCKING

...since I mistakenly thought I got both with PROVE_LOCKING=y.

These have received a build success notification from the 0day robot,
and some have appeared in a -next release with no reported issues"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
x86, pmem: fix broken __copy_user_nocache cache-bypass assumptions
device-dax: switch to srcu, fix rcu_read_lock() vs pte allocation
libnvdimm: band aid btt vs clear poison locking
libnvdimm: fix reconfig_mutex, mmap_sem, and jbd2_handle lockdep splat
libnvdimm: fix blk free space accounting
acpi, nfit, libnvdimm: fix interleave set cookie calculation (64-bit comparison)

7 files changed, +70 -85
+31 -11
arch/x86/include/asm/pmem.h
···
  * @size: number of bytes to write back
  *
  * Write back a cache range using the CLWB (cache line write back)
- * instruction.
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
  */
 static inline void arch_wb_cache_pmem(void *addr, size_t size)
 {
···
 	for (p = (void *)((unsigned long)addr & ~clflush_mask);
 	     p < vend; p += x86_clflush_size)
 		clwb(p);
-}
-
-/*
- * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
- * iterators, so for other types (bvec & kvec) we must do a cache write-back.
- */
-static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
-{
-	return iter_is_iovec(i) == false;
 }
 
 /**
···
 	/* TODO: skip the write-back by always using non-temporal stores */
 	len = copy_from_iter_nocache(addr, bytes, i);
 
-	if (__iter_needs_pmem_wb(i))
+	/*
+	 * In the iovec case on x86_64 copy_from_iter_nocache() uses
+	 * non-temporal stores for the bulk of the transfer, but we need
+	 * to manually flush if the transfer is unaligned. A cached
+	 * memory copy is used when destination or size is not naturally
+	 * aligned. That is:
+	 * - Require 8-byte alignment when size is 8 bytes or larger.
+	 * - Require 4-byte alignment when size is 4 bytes.
+	 *
+	 * In the non-iovec case the entire destination needs to be
+	 * flushed.
+	 */
+	if (iter_is_iovec(i)) {
+		unsigned long flushed, dest = (unsigned long) addr;
+
+		if (bytes < 8) {
+			if (!IS_ALIGNED(dest, 4) || (bytes != 4))
+				arch_wb_cache_pmem(addr, 1);
+		} else {
+			if (!IS_ALIGNED(dest, 8)) {
+				dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+				arch_wb_cache_pmem(addr, 1);
+			}
+
+			flushed = dest - (unsigned long) addr;
+			if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
+				arch_wb_cache_pmem(addr + bytes - 1, 1);
+		}
+	} else
 		arch_wb_cache_pmem(addr, bytes);
 
 	return len;
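In prose: a transfer only gets non-temporal (cache-bypassing) stores for its naturally aligned middle; any ragged head or tail goes through the cache and must be written back by hand. Below is a minimal user-space sketch of that decision logic, not kernel code: IS_ALIGNED/ALIGN are re-implemented here and a 64-byte cache line is assumed.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

/* User-space stand-ins for the kernel's alignment helpers. */
#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)
#define ALIGN(x, a)		(((x) + ((a) - 1)) & ~((uintptr_t)(a) - 1))

#define CLFLUSH_SIZE 64		/* assumed cache-line size */

/* Mirror the post-copy flush decision above: report whether the
 * ragged head and/or tail of a transfer to 'addr' of 'bytes' bytes
 * would still need a manual cache write-back. */
static void check(uintptr_t addr, size_t bytes)
{
	uintptr_t dest = addr;
	bool head = false, tail = false;
	size_t flushed;

	if (bytes < 8) {
		/* small copies use cached stores unless they are a
		 * 4-byte store to a 4-byte-aligned address */
		if (!IS_ALIGNED(dest, 4) || bytes != 4)
			head = true;
	} else {
		if (!IS_ALIGNED(dest, 8)) {
			/* unaligned head was copied with cached stores */
			head = true;
			dest = ALIGN(dest, CLFLUSH_SIZE);
		}
		flushed = dest - addr;
		if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8))
			tail = true;	/* ragged tail was cached too */
	}
	printf("addr=%#lx bytes=%4zu -> head %s, tail %s\n",
			(unsigned long)addr, bytes,
			head ? "flush" : "ok", tail ? "flush" : "ok");
}

int main(void)
{
	check(0x1000, 4096);	/* fully aligned: no manual flush */
	check(0x1000, 13);	/* aligned head, ragged tail */
	check(0x1003, 64);	/* ragged head and ragged tail */
	check(0x1002, 2);	/* small unaligned store: flush */
	return 0;
}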
+5 -1
drivers/acpi/nfit/core.c
···
 	const struct nfit_set_info_map *map0 = m0;
 	const struct nfit_set_info_map *map1 = m1;
 
-	return map0->region_offset - map1->region_offset;
+	if (map0->region_offset < map1->region_offset)
+		return -1;
+	else if (map0->region_offset > map1->region_offset)
+		return 1;
+	return 0;
 }
 
 /* Retrieve the nth entry referencing this spa */
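The bug here is subtle: region_offset is a 64-bit field, but a sort comparator returns an int, so the old subtraction truncates and can report two different offsets as equal, or mis-order them. A standalone user-space illustration of the failure mode (hypothetical values, plain C):

#include <stdio.h>
#include <stdint.h>

/* Buggy comparator: truncating a u64 difference to int can flip the
 * sign or report 0 when the offsets differ only in the high bits. */
static int cmp_buggy(uint64_t a, uint64_t b)
{
	return a - b;	/* implicit truncation to int */
}

/* Fixed comparator: explicit three-way comparison, as in the patch. */
static int cmp_fixed(uint64_t a, uint64_t b)
{
	if (a < b)
		return -1;
	else if (a > b)
		return 1;
	return 0;
}

int main(void)
{
	uint64_t a = 0x100000000ULL;	/* differs only above bit 31 */
	uint64_t b = 0;

	printf("buggy: %d (claims equal!)\n", cmp_buggy(a, b));
	printf("fixed: %d\n", cmp_fixed(a, b));
	return 0;
}

With these inputs the buggy version returns 0 even though a > b, which is exactly how a sorted interleave-set map, and therefore the set cookie, could come out wrong.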
+1
drivers/dax/Kconfig
···
 	tristate "DAX: direct access to differentiated memory"
 	default m if NVDIMM_DAX
 	depends on TRANSPARENT_HUGEPAGE
+	select SRCU
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
+7 -6
drivers/dax/dax.c
···
 #include "dax.h"
 
 static dev_t dax_devt;
+DEFINE_STATIC_SRCU(dax_srcu);
 static struct class *dax_class;
 static DEFINE_IDA(dax_minor_ida);
 static int nr_dax = CONFIG_NR_DEV_DAX;
···
  * @region - parent region
  * @dev - device backing the character device
  * @cdev - core chardev data
- * @alive - !alive + rcu grace period == no new mappings can be established
+ * @alive - !alive + srcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
···
 static int dax_dev_huge_fault(struct vm_fault *vmf,
 		enum page_entry_size pe_size)
 {
-	int rc;
+	int rc, id;
 	struct file *filp = vmf->vma->vm_file;
 	struct dax_dev *dax_dev = filp->private_data;
 
···
 			? "write" : "read",
 			vmf->vma->vm_start, vmf->vma->vm_end);
 
-	rcu_read_lock();
+	id = srcu_read_lock(&dax_srcu);
 	switch (pe_size) {
 	case PE_SIZE_PTE:
 		rc = __dax_dev_pte_fault(dax_dev, vmf);
···
 	default:
 		return VM_FAULT_FALLBACK;
 	}
-	rcu_read_unlock();
+	srcu_read_unlock(&dax_srcu, id);
 
 	return rc;
 }
···
 	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
 	 * ensuring that any fault handlers that might have seen
 	 * dax_dev->alive == true, have completed. Any fault handlers
-	 * that start after synchronize_rcu() has started will abort
+	 * that start after synchronize_srcu() has started will abort
 	 * upon seeing dax_dev->alive == false.
 	 */
 	dax_dev->alive = false;
-	synchronize_rcu();
+	synchronize_srcu(&dax_srcu);
 	unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
 	cdev_del(cdev);
 	device_unregister(dev);
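The idiom worth noting: srcu here is not protecting dax_dev's lifetime, it is a barrier guaranteeing that every fault handler that could have observed alive == true has finished before teardown proceeds, while still letting those handlers sleep (plain RCU forbids sleeping in the read side, hence the lockdep complaints). A rough user-space analogue of the protocol, with a pthread rwlock standing in for the SRCU domain; names are illustrative, not kernel API, and the volatile flag is a stand-in for proper memory ordering:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* rwlock as a crude SRCU stand-in: rdlock() plays srcu_read_lock(),
 * taking the write lock plays synchronize_srcu(). */
static pthread_rwlock_t grace = PTHREAD_RWLOCK_INITIALIZER;
static volatile bool alive = true;

static void *fault_handler(void *arg)
{
	pthread_rwlock_rdlock(&grace);
	if (!alive) {
		/* raced with teardown: abort, establish no mapping */
		pthread_rwlock_unlock(&grace);
		return NULL;
	}
	usleep(1000);	/* simulate a sleeping fault (pte allocation) */
	printf("fault completed while device alive\n");
	pthread_rwlock_unlock(&grace);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fault_handler, NULL);

	/* teardown: flip alive, then wait out in-flight readers */
	alive = false;
	pthread_rwlock_wrlock(&grace);	/* "synchronize_srcu()" */
	pthread_rwlock_unlock(&grace);

	/* safe now: no handler can still believe alive == true */
	printf("teardown complete\n");
	pthread_join(t, NULL);
	return 0;
}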
+6
drivers/nvdimm/bus.c
···
 	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL);
 	if (rc < 0)
 		goto out_unlock;
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+
 	if (copy_to_user(p, buf, buf_len))
 		rc = -EFAULT;
+
+	vfree(buf);
+	return rc;
+
 out_unlock:
 	nvdimm_bus_unlock(&nvdimm_bus->dev);
 out:
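The shape of this fix generalizes: copy_to_user() can fault and block on mmap_sem, so it must not run under a lock that fault paths can also reach, which is the reconfig_mutex vs mmap_sem vs jbd2_handle inversion lockdep reported. A hedged user-space sketch of the safe ordering, with stand-in names rather than the nvdimm API: do the work into a private buffer under the lock, drop the lock, and only then copy the result out.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t bus_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for copy_to_user(): may "fault" and take other locks,
 * so it must never run under bus_lock. */
static int copy_out(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
	return 0;
}

/* Shape of the fixed ioctl tail: finish the locked work, unlock,
 * and only then copy results back to the caller. */
static int do_ioctl(void *user_buf, size_t len)
{
	int rc = 0;
	void *buf = malloc(len);

	if (!buf)
		return -1;

	pthread_mutex_lock(&bus_lock);
	memset(buf, 0, len);		/* the "command" runs locked */
	pthread_mutex_unlock(&bus_lock);	/* drop before copy-out */

	if (copy_out(user_buf, buf, len))
		rc = -1;

	free(buf);
	return rc;
}

int main(void)
{
	char out[16];

	return do_ioctl(out, sizeof(out));
}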
+9 -1
drivers/nvdimm/claim.c
···
 	}
 
 	if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
-		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
+		/*
+		 * FIXME: nsio_rw_bytes() may be called from atomic
+		 * context in the btt case and nvdimm_clear_poison()
+		 * takes a sleeping lock. Until the locking can be
+		 * reworked this capability requires that the namespace
+		 * is not claimed by btt.
+		 */
+		if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
+				&& (!ndns->claim || !is_nd_btt(ndns->claim))) {
 			long cleared;
 
 			cleared = nvdimm_clear_poison(&ndns->dev, offset, size);
+11 -66
drivers/nvdimm/dimm_devs.c
···
 
 int alias_dpa_busy(struct device *dev, void *data)
 {
-	resource_size_t map_end, blk_start, new, busy;
+	resource_size_t map_end, blk_start, new;
 	struct blk_alloc_info *info = data;
 	struct nd_mapping *nd_mapping;
 	struct nd_region *nd_region;
···
 retry:
 	/*
 	 * Find the free dpa from the end of the last pmem allocation to
-	 * the end of the interleave-set mapping that is not already
-	 * covered by a blk allocation.
+	 * the end of the interleave-set mapping.
 	 */
-	busy = 0;
 	for_each_dpa_resource(ndd, res) {
+		if (strncmp(res->name, "pmem", 4) != 0)
+			continue;
 		if ((res->start >= blk_start && res->start < map_end)
 				|| (res->end >= blk_start
 					&& res->end <= map_end)) {
-			if (strncmp(res->name, "pmem", 4) == 0) {
-				new = max(blk_start, min(map_end + 1,
-							res->end + 1));
-				if (new != blk_start) {
-					blk_start = new;
-					goto retry;
-				}
-			} else
-				busy += min(map_end, res->end)
-					- max(nd_mapping->start, res->start) + 1;
-		} else if (nd_mapping->start > res->start
-				&& map_end < res->end) {
-			/* total eclipse of the PMEM region mapping */
-			busy += nd_mapping->size;
-			break;
+			new = max(blk_start, min(map_end + 1, res->end + 1));
+			if (new != blk_start) {
+				blk_start = new;
+				goto retry;
+			}
 		}
 	}
 
···
 		return 1;
 	}
 
-	info->available -= blk_start - nd_mapping->start + busy;
+	info->available -= blk_start - nd_mapping->start;
 
 	return 0;
-}
-
-static int blk_dpa_busy(struct device *dev, void *data)
-{
-	struct blk_alloc_info *info = data;
-	struct nd_mapping *nd_mapping;
-	struct nd_region *nd_region;
-	resource_size_t map_end;
-	int i;
-
-	if (!is_nd_pmem(dev))
-		return 0;
-
-	nd_region = to_nd_region(dev);
-	for (i = 0; i < nd_region->ndr_mappings; i++) {
-		nd_mapping = &nd_region->mapping[i];
-		if (nd_mapping->nvdimm == info->nd_mapping->nvdimm)
-			break;
-	}
-
-	if (i >= nd_region->ndr_mappings)
-		return 0;
-
-	map_end = nd_mapping->start + nd_mapping->size - 1;
-	if (info->res->start >= nd_mapping->start
-			&& info->res->start < map_end) {
-		if (info->res->end <= map_end) {
-			info->busy = 0;
-			return 1;
-		} else {
-			info->busy -= info->res->end - map_end;
-			return 0;
-		}
-	} else if (info->res->end >= nd_mapping->start
-			&& info->res->end <= map_end) {
-		info->busy -= nd_mapping->start - info->res->start;
-		return 0;
-	} else {
-		info->busy -= nd_mapping->size;
-		return 0;
-	}
 }
 
 /**
···
 	for_each_dpa_resource(ndd, res) {
 		if (strncmp(res->name, "blk", 3) != 0)
 			continue;
-
-		info.res = res;
-		info.busy = resource_size(res);
-		device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy);
-		info.available -= info.busy;
+		info.available -= resource_size(res);
 	}
 
 	return info.available;
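The retry loop is the core of the new accounting: starting from the base of the BLK mapping, blk_start is pushed past every overlapping pmem allocation until it stabilizes, and only the space above the final blk_start counts as blk-available. A small user-space simulation of that loop, with a hypothetical resource list and simplified types:

#include <stdio.h>

struct res {
	unsigned long long start, end;	/* inclusive range, pmem only */
};

/* Advance blk_start past every pmem allocation that overlaps
 * [blk_start, map_end], restarting the scan after each move, the
 * same way alias_dpa_busy() does with its goto retry. */
static unsigned long long first_blk_free(struct res *pmem, int n,
		unsigned long long blk_start, unsigned long long map_end)
{
	int i;
retry:
	for (i = 0; i < n; i++) {
		struct res *r = &pmem[i];
		unsigned long long new;

		if ((r->start >= blk_start && r->start < map_end)
				|| (r->end >= blk_start && r->end <= map_end)) {
			/* new = max(blk_start, min(map_end + 1, r->end + 1)) */
			new = r->end + 1 < map_end + 1 ? r->end + 1 : map_end + 1;
			if (new < blk_start)
				new = blk_start;
			if (new != blk_start) {
				blk_start = new;
				goto retry;
			}
		}
	}
	return blk_start;
}

int main(void)
{
	/* two pmem allocations at the bottom of a 0x0-0xffff mapping */
	struct res pmem[] = { { 0x0, 0x0fff }, { 0x1000, 0x1fff } };

	printf("blk space starts at %#llx\n",
			first_blk_free(pmem, 2, 0x0, 0xffff));
	return 0;
}

With the two allocations above, the loop walks blk_start from 0x0 to 0x1000 to 0x2000 and stops, so everything from 0x2000 up is counted as blk-available; the deleted blk_dpa_busy() machinery tried to derive the same answer by subtracting overlaps and got it wrong.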