Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge small final update from Andrew Morton:

- DAX feature work: add fsync/msync support

- kfree cleanup, MAINTAINERS update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
MAINTAINERS: return arch/sh to maintained state, with new maintainers
tree wide: use kvfree() rather than conditional kfree()/vfree()
dax: never rely on bh.b_dev being set by get_block()
xfs: call dax_pfn_mkwrite() for DAX fsync/msync
ext4: call dax_pfn_mkwrite() for DAX fsync/msync
ext2: call dax_pfn_mkwrite() for DAX fsync/msync
dax: add support for fsync/sync
mm: add find_get_entries_tag()
dax: support dirty DAX entries in radix tree
pmem: add wb_cache_pmem() to the PMEM API
dax: fix conversion of holes to PMDs
dax: fix NULL pointer dereference in __dax_dbg()

+493 -171
+3 -1
MAINTAINERS
··· 10453 10453 F: drivers/net/ethernet/dlink/sundance.c 10454 10454 10455 10455 SUPERH 10456 + M: Yoshinori Sato <ysato@users.sourceforge.jp> 10457 + M: Rich Felker <dalias@libc.org> 10456 10458 L: linux-sh@vger.kernel.org 10457 10459 Q: http://patchwork.kernel.org/project/linux-sh/list/ 10458 - S: Orphan 10460 + S: Maintained 10459 10461 F: Documentation/sh/ 10460 10462 F: arch/sh/ 10461 10463 F: drivers/sh/
+2 -9
arch/arm/mm/dma-mapping.c
··· 1200 1200 while (i--) 1201 1201 if (pages[i]) 1202 1202 __free_pages(pages[i], 0); 1203 - if (array_size <= PAGE_SIZE) 1204 - kfree(pages); 1205 - else 1206 - vfree(pages); 1203 + kvfree(pages); 1207 1204 return NULL; 1208 1205 } 1209 1206 ··· 1208 1211 size_t size, struct dma_attrs *attrs) 1209 1212 { 1210 1213 int count = size >> PAGE_SHIFT; 1211 - int array_size = count * sizeof(struct page *); 1212 1214 int i; 1213 1215 1214 1216 if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) { ··· 1218 1222 __free_pages(pages[i], 0); 1219 1223 } 1220 1224 1221 - if (array_size <= PAGE_SIZE) 1222 - kfree(pages); 1223 - else 1224 - vfree(pages); 1225 + kvfree(pages); 1225 1226 return 0; 1226 1227 } 1227 1228
+6 -5
arch/x86/include/asm/pmem.h
··· 67 67 } 68 68 69 69 /** 70 - * __arch_wb_cache_pmem - write back a cache range with CLWB 70 + * arch_wb_cache_pmem - write back a cache range with CLWB 71 71 * @vaddr: virtual start address 72 72 * @size: number of bytes to write back 73 73 * 74 74 * Write back a cache range using the CLWB (cache line write back) 75 75 * instruction. This function requires explicit ordering with an 76 - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. 76 + * arch_wmb_pmem() call. 77 77 */ 78 - static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) 78 + static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) 79 79 { 80 80 u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; 81 81 unsigned long clflush_mask = x86_clflush_size - 1; 82 + void *vaddr = (void __force *)addr; 82 83 void *vend = vaddr + size; 83 84 void *p; 84 85 ··· 116 115 len = copy_from_iter_nocache(vaddr, bytes, i); 117 116 118 117 if (__iter_needs_pmem_wb(i)) 119 - __arch_wb_cache_pmem(vaddr, bytes); 118 + arch_wb_cache_pmem(addr, bytes); 120 119 121 120 return len; 122 121 } ··· 134 133 void *vaddr = (void __force *)addr; 135 134 136 135 memset(vaddr, 0, size); 137 - __arch_wb_cache_pmem(vaddr, size); 136 + arch_wb_cache_pmem(addr, size); 138 137 } 139 138 140 139 static inline bool __arch_has_wmb_pmem(void)
+2 -4
drivers/acpi/apei/erst.c
··· 32 32 #include <linux/hardirq.h> 33 33 #include <linux/pstore.h> 34 34 #include <linux/vmalloc.h> 35 + #include <linux/mm.h> /* kvfree() */ 35 36 #include <acpi/apei.h> 36 37 37 38 #include "apei-internal.h" ··· 533 532 return -ENOMEM; 534 533 memcpy(new_entries, entries, 535 534 erst_record_id_cache.len * sizeof(entries[0])); 536 - if (erst_record_id_cache.size < PAGE_SIZE) 537 - kfree(entries); 538 - else 539 - vfree(entries); 535 + kvfree(entries); 540 536 erst_record_id_cache.entries = entries = new_entries; 541 537 erst_record_id_cache.size = new_size; 542 538 }
+7 -19
drivers/block/drbd/drbd_bitmap.c
··· 364 364 } 365 365 } 366 366 367 - static void bm_vk_free(void *ptr, int v) 367 + static inline void bm_vk_free(void *ptr) 368 368 { 369 - if (v) 370 - vfree(ptr); 371 - else 372 - kfree(ptr); 369 + kvfree(ptr); 373 370 } 374 371 375 372 /* ··· 376 379 { 377 380 struct page **old_pages = b->bm_pages; 378 381 struct page **new_pages, *page; 379 - unsigned int i, bytes, vmalloced = 0; 382 + unsigned int i, bytes; 380 383 unsigned long have = b->bm_number_of_pages; 381 384 382 385 BUG_ON(have == 0 && old_pages != NULL); ··· 398 401 PAGE_KERNEL); 399 402 if (!new_pages) 400 403 return NULL; 401 - vmalloced = 1; 402 404 } 403 405 404 406 if (want >= have) { ··· 407 411 page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); 408 412 if (!page) { 409 413 bm_free_pages(new_pages + have, i - have); 410 - bm_vk_free(new_pages, vmalloced); 414 + bm_vk_free(new_pages); 411 415 return NULL; 412 416 } 413 417 /* we want to know which page it is ··· 422 426 bm_free_pages(old_pages + want, have - want); 423 427 */ 424 428 } 425 - 426 - if (vmalloced) 427 - b->bm_flags |= BM_P_VMALLOCED; 428 - else 429 - b->bm_flags &= ~BM_P_VMALLOCED; 430 429 431 430 return new_pages; 432 431 } ··· 460 469 if (!expect(device->bitmap)) 461 470 return; 462 471 bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); 463 - bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); 472 + bm_vk_free(device->bitmap->bm_pages); 464 473 kfree(device->bitmap); 465 474 device->bitmap = NULL; 466 475 } ··· 634 643 unsigned long want, have, onpages; /* number of pages */ 635 644 struct page **npages, **opages = NULL; 636 645 int err = 0, growing; 637 - int opages_vmalloced; 638 646 639 647 if (!expect(b)) 640 648 return -ENOMEM; ··· 645 655 646 656 if (capacity == b->bm_dev_capacity) 647 657 goto out; 648 - 649 - opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); 650 658 651 659 if (capacity == 0) { 652 660 spin_lock_irq(&b->bm_lock); ··· 659 671 b->bm_dev_capacity = 0; 660 
672 spin_unlock_irq(&b->bm_lock); 661 673 bm_free_pages(opages, onpages); 662 - bm_vk_free(opages, opages_vmalloced); 674 + bm_vk_free(opages); 663 675 goto out; 664 676 } 665 677 bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); ··· 732 744 733 745 spin_unlock_irq(&b->bm_lock); 734 746 if (opages != npages) 735 - bm_vk_free(opages, opages_vmalloced); 747 + bm_vk_free(opages); 736 748 if (!growing) 737 749 b->bm_set = bm_count_bits(b); 738 750 drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
-3
drivers/block/drbd/drbd_int.h
··· 536 536 /* definition of bits in bm_flags to be used in drbd_bm_lock 537 537 * and drbd_bitmap_io and friends. */ 538 538 enum bm_flag { 539 - /* do we need to kfree, or vfree bm_pages? */ 540 - BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ 541 - 542 539 /* currently locked for bulk operation */ 543 540 BM_LOCKED_MASK = 0xf, 544 541
+3 -12
drivers/char/mspec.c
··· 93 93 spinlock_t lock; /* Serialize access to this structure. */ 94 94 int count; /* Number of pages allocated. */ 95 95 enum mspec_page_type type; /* Type of pages allocated. */ 96 - int flags; /* See VMD_xxx below. */ 97 96 unsigned long vm_start; /* Original (unsplit) base. */ 98 97 unsigned long vm_end; /* Original (unsplit) end. */ 99 98 unsigned long maddr[0]; /* Array of MSPEC addresses. */ 100 99 }; 101 - 102 - #define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ 103 100 104 101 /* used on shub2 to clear FOP cache in the HUB */ 105 102 static unsigned long scratch_page[MAX_NUMNODES]; ··· 182 185 "failed to zero page %ld\n", my_page); 183 186 } 184 187 185 - if (vdata->flags & VMD_VMALLOCED) 186 - vfree(vdata); 187 - else 188 - kfree(vdata); 188 + kvfree(vdata); 189 189 } 190 190 191 191 /* ··· 250 256 enum mspec_page_type type) 251 257 { 252 258 struct vma_data *vdata; 253 - int pages, vdata_size, flags = 0; 259 + int pages, vdata_size; 254 260 255 261 if (vma->vm_pgoff != 0) 256 262 return -EINVAL; ··· 265 271 vdata_size = sizeof(struct vma_data) + pages * sizeof(long); 266 272 if (vdata_size <= PAGE_SIZE) 267 273 vdata = kzalloc(vdata_size, GFP_KERNEL); 268 - else { 274 + else 269 275 vdata = vzalloc(vdata_size); 270 - flags = VMD_VMALLOCED; 271 - } 272 276 if (!vdata) 273 277 return -ENOMEM; 274 278 275 279 vdata->vm_start = vma->vm_start; 276 280 vdata->vm_end = vma->vm_end; 277 - vdata->flags = flags; 278 281 vdata->type = type; 279 282 spin_lock_init(&vdata->lock); 280 283 atomic_set(&vdata->refcnt, 1);
+1 -4
drivers/gpu/drm/drm_hashtab.c
··· 198 198 void drm_ht_remove(struct drm_open_hash *ht) 199 199 { 200 200 if (ht->table) { 201 - if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order) 202 - kfree(ht->table); 203 - else 204 - vfree(ht->table); 201 + kvfree(ht->table); 205 202 ht->table = NULL; 206 203 } 207 204 }
+2 -6
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
··· 151 151 152 152 #define LIBCFS_FREE(ptr, size) \ 153 153 do { \ 154 - int s = (size); \ 155 154 if (unlikely((ptr) == NULL)) { \ 156 155 CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ 157 - "%s:%d\n", s, __FILE__, __LINE__); \ 156 + "%s:%d\n", (int)(size), __FILE__, __LINE__); \ 158 157 break; \ 159 158 } \ 160 - if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ 161 - vfree(ptr); \ 162 - else \ 163 - kfree(ptr); \ 159 + kvfree(ptr); \ 164 160 } while (0) 165 161 166 162 /******************************************************************************/
+1 -1
fs/block_dev.c
··· 75 75 { 76 76 struct address_space *mapping = bdev->bd_inode->i_mapping; 77 77 78 - if (mapping->nrpages == 0 && mapping->nrshadows == 0) 78 + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) 79 79 return; 80 80 81 81 invalidate_bh_lrus();
+1 -2
fs/coda/coda_linux.h
··· 72 72 } while (0) 73 73 74 74 75 - #define CODA_FREE(ptr,size) \ 76 - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) 75 + #define CODA_FREE(ptr, size) kvfree((ptr)) 77 76 78 77 /* inode to cnode access functions */ 79 78
+260 -14
fs/dax.c
··· 24 24 #include <linux/memcontrol.h> 25 25 #include <linux/mm.h> 26 26 #include <linux/mutex.h> 27 + #include <linux/pagevec.h> 27 28 #include <linux/pmem.h> 28 29 #include <linux/sched.h> 29 30 #include <linux/uio.h> ··· 246 245 loff_t end = pos + iov_iter_count(iter); 247 246 248 247 memset(&bh, 0, sizeof(bh)); 248 + bh.b_bdev = inode->i_sb->s_bdev; 249 249 250 250 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { 251 251 struct address_space *mapping = inode->i_mapping; ··· 326 324 return 0; 327 325 } 328 326 327 + #define NO_SECTOR -1 328 + #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) 329 + 330 + static int dax_radix_entry(struct address_space *mapping, pgoff_t index, 331 + sector_t sector, bool pmd_entry, bool dirty) 332 + { 333 + struct radix_tree_root *page_tree = &mapping->page_tree; 334 + pgoff_t pmd_index = DAX_PMD_INDEX(index); 335 + int type, error = 0; 336 + void *entry; 337 + 338 + WARN_ON_ONCE(pmd_entry && !dirty); 339 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 340 + 341 + spin_lock_irq(&mapping->tree_lock); 342 + 343 + entry = radix_tree_lookup(page_tree, pmd_index); 344 + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { 345 + index = pmd_index; 346 + goto dirty; 347 + } 348 + 349 + entry = radix_tree_lookup(page_tree, index); 350 + if (entry) { 351 + type = RADIX_DAX_TYPE(entry); 352 + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && 353 + type != RADIX_DAX_PMD)) { 354 + error = -EIO; 355 + goto unlock; 356 + } 357 + 358 + if (!pmd_entry || type == RADIX_DAX_PMD) 359 + goto dirty; 360 + 361 + /* 362 + * We only insert dirty PMD entries into the radix tree. This 363 + * means we don't need to worry about removing a dirty PTE 364 + * entry and inserting a clean PMD entry, thus reducing the 365 + * range we would flush with a follow-up fsync/msync call. 
366 + */ 367 + radix_tree_delete(&mapping->page_tree, index); 368 + mapping->nrexceptional--; 369 + } 370 + 371 + if (sector == NO_SECTOR) { 372 + /* 373 + * This can happen during correct operation if our pfn_mkwrite 374 + * fault raced against a hole punch operation. If this 375 + * happens the pte that was hole punched will have been 376 + * unmapped and the radix tree entry will have been removed by 377 + * the time we are called, but the call will still happen. We 378 + * will return all the way up to wp_pfn_shared(), where the 379 + * pte_same() check will fail, eventually causing page fault 380 + * to be retried by the CPU. 381 + */ 382 + goto unlock; 383 + } 384 + 385 + error = radix_tree_insert(page_tree, index, 386 + RADIX_DAX_ENTRY(sector, pmd_entry)); 387 + if (error) 388 + goto unlock; 389 + 390 + mapping->nrexceptional++; 391 + dirty: 392 + if (dirty) 393 + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 394 + unlock: 395 + spin_unlock_irq(&mapping->tree_lock); 396 + return error; 397 + } 398 + 399 + static int dax_writeback_one(struct block_device *bdev, 400 + struct address_space *mapping, pgoff_t index, void *entry) 401 + { 402 + struct radix_tree_root *page_tree = &mapping->page_tree; 403 + int type = RADIX_DAX_TYPE(entry); 404 + struct radix_tree_node *node; 405 + struct blk_dax_ctl dax; 406 + void **slot; 407 + int ret = 0; 408 + 409 + spin_lock_irq(&mapping->tree_lock); 410 + /* 411 + * Regular page slots are stabilized by the page lock even 412 + * without the tree itself locked. These unlocked entries 413 + * need verification under the tree lock. 
414 + */ 415 + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) 416 + goto unlock; 417 + if (*slot != entry) 418 + goto unlock; 419 + 420 + /* another fsync thread may have already written back this entry */ 421 + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 422 + goto unlock; 423 + 424 + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { 425 + ret = -EIO; 426 + goto unlock; 427 + } 428 + 429 + dax.sector = RADIX_DAX_SECTOR(entry); 430 + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); 431 + spin_unlock_irq(&mapping->tree_lock); 432 + 433 + /* 434 + * We cannot hold tree_lock while calling dax_map_atomic() because it 435 + * eventually calls cond_resched(). 436 + */ 437 + ret = dax_map_atomic(bdev, &dax); 438 + if (ret < 0) 439 + return ret; 440 + 441 + if (WARN_ON_ONCE(ret < dax.size)) { 442 + ret = -EIO; 443 + goto unmap; 444 + } 445 + 446 + wb_cache_pmem(dax.addr, dax.size); 447 + 448 + spin_lock_irq(&mapping->tree_lock); 449 + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 450 + spin_unlock_irq(&mapping->tree_lock); 451 + unmap: 452 + dax_unmap_atomic(bdev, &dax); 453 + return ret; 454 + 455 + unlock: 456 + spin_unlock_irq(&mapping->tree_lock); 457 + return ret; 458 + } 459 + 460 + /* 461 + * Flush the mapping to the persistent domain within the byte range of [start, 462 + * end]. This is required by data integrity operations to ensure file data is 463 + * on persistent storage prior to completion of the operation. 
464 + */ 465 + int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, 466 + loff_t end) 467 + { 468 + struct inode *inode = mapping->host; 469 + struct block_device *bdev = inode->i_sb->s_bdev; 470 + pgoff_t start_index, end_index, pmd_index; 471 + pgoff_t indices[PAGEVEC_SIZE]; 472 + struct pagevec pvec; 473 + bool done = false; 474 + int i, ret = 0; 475 + void *entry; 476 + 477 + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 478 + return -EIO; 479 + 480 + start_index = start >> PAGE_CACHE_SHIFT; 481 + end_index = end >> PAGE_CACHE_SHIFT; 482 + pmd_index = DAX_PMD_INDEX(start_index); 483 + 484 + rcu_read_lock(); 485 + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); 486 + rcu_read_unlock(); 487 + 488 + /* see if the start of our range is covered by a PMD entry */ 489 + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) 490 + start_index = pmd_index; 491 + 492 + tag_pages_for_writeback(mapping, start_index, end_index); 493 + 494 + pagevec_init(&pvec, 0); 495 + while (!done) { 496 + pvec.nr = find_get_entries_tag(mapping, start_index, 497 + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 498 + pvec.pages, indices); 499 + 500 + if (pvec.nr == 0) 501 + break; 502 + 503 + for (i = 0; i < pvec.nr; i++) { 504 + if (indices[i] > end_index) { 505 + done = true; 506 + break; 507 + } 508 + 509 + ret = dax_writeback_one(bdev, mapping, indices[i], 510 + pvec.pages[i]); 511 + if (ret < 0) 512 + return ret; 513 + } 514 + } 515 + wmb_pmem(); 516 + return 0; 517 + } 518 + EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 519 + 329 520 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 330 521 struct vm_area_struct *vma, struct vm_fault *vmf) 331 522 { ··· 557 362 wmb_pmem(); 558 363 } 559 364 dax_unmap_atomic(bdev, &dax); 365 + 366 + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, 367 + vmf->flags & FAULT_FLAG_WRITE); 368 + if (error) 369 + goto out; 560 370 561 371 error = vm_insert_mixed(vma, vaddr, dax.pfn); 562 
372 ··· 608 408 609 409 memset(&bh, 0, sizeof(bh)); 610 410 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); 411 + bh.b_bdev = inode->i_sb->s_bdev; 611 412 bh.b_size = PAGE_SIZE; 612 413 613 414 repeat: ··· 688 487 delete_from_page_cache(page); 689 488 unlock_page(page); 690 489 page_cache_release(page); 490 + page = NULL; 691 491 } 692 492 693 493 /* ··· 792 590 struct block_device *bdev; 793 591 pgoff_t size, pgoff; 794 592 sector_t block; 795 - int result = 0; 593 + int error, result = 0; 594 + bool alloc = false; 796 595 797 596 /* dax pmd mappings require pfn_t_devmap() */ 798 597 if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) ··· 827 624 } 828 625 829 626 memset(&bh, 0, sizeof(bh)); 627 + bh.b_bdev = inode->i_sb->s_bdev; 830 628 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); 831 629 832 630 bh.b_size = PMD_SIZE; 833 - if (get_block(inode, block, &bh, write) != 0) 631 + 632 + if (get_block(inode, block, &bh, 0) != 0) 834 633 return VM_FAULT_SIGBUS; 634 + 635 + if (!buffer_mapped(&bh) && write) { 636 + if (get_block(inode, block, &bh, 1) != 0) 637 + return VM_FAULT_SIGBUS; 638 + alloc = true; 639 + } 640 + 835 641 bdev = bh.b_bdev; 836 - i_mmap_lock_read(mapping); 837 642 838 643 /* 839 644 * If the filesystem isn't willing to tell us the length of a hole, ··· 850 639 */ 851 640 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { 852 641 dax_pmd_dbg(&bh, address, "allocated block too small"); 853 - goto fallback; 642 + return VM_FAULT_FALLBACK; 854 643 } 855 644 856 645 /* 857 646 * If we allocated new storage, make sure no process has any 858 647 * zero pages covering this hole 859 648 */ 860 - if (buffer_new(&bh)) { 861 - i_mmap_unlock_read(mapping); 862 - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); 863 - i_mmap_lock_read(mapping); 649 + if (alloc) { 650 + loff_t lstart = pgoff << PAGE_SHIFT; 651 + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ 652 + 653 + truncate_pagecache_range(inode, lstart, lend); 864 654 } 655 + 656 + 
i_mmap_lock_read(mapping); 865 657 866 658 /* 867 659 * If a truncate happened while we were allocating blocks, we may ··· 878 664 goto out; 879 665 } 880 666 if ((pgoff | PG_PMD_COLOUR) >= size) { 881 - dax_pmd_dbg(&bh, address, "pgoff unaligned"); 667 + dax_pmd_dbg(&bh, address, 668 + "offset + huge page size > file size"); 882 669 goto fallback; 883 670 } 884 671 ··· 947 732 } 948 733 dax_unmap_atomic(bdev, &dax); 949 734 735 + /* 736 + * For PTE faults we insert a radix tree entry for reads, and 737 + * leave it clean. Then on the first write we dirty the radix 738 + * tree entry via the dax_pfn_mkwrite() path. This sequence 739 + * allows the dax_pfn_mkwrite() call to be simpler and avoid a 740 + * call into get_block() to translate the pgoff to a sector in 741 + * order to be able to create a new radix tree entry. 742 + * 743 + * The PMD path doesn't have an equivalent to 744 + * dax_pfn_mkwrite(), though, so for a read followed by a 745 + * write we traverse all the way through __dax_pmd_fault() 746 + * twice. This means we can just skip inserting a radix tree 747 + * entry completely on the initial read and just wait until 748 + * the write to insert a dirty entry. 
749 + */ 750 + if (write) { 751 + error = dax_radix_entry(mapping, pgoff, dax.sector, 752 + true, true); 753 + if (error) { 754 + dax_pmd_dbg(&bh, address, 755 + "PMD radix insertion failed"); 756 + goto fallback; 757 + } 758 + } 759 + 950 760 dev_dbg(part_to_dev(bdev->bd_part), 951 761 "%s: %s addr: %lx pfn: %lx sect: %llx\n", 952 762 __func__, current->comm, address, ··· 1030 790 * dax_pfn_mkwrite - handle first write to DAX page 1031 791 * @vma: The virtual memory area where the fault occurred 1032 792 * @vmf: The description of the fault 1033 - * 1034 793 */ 1035 794 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1036 795 { 1037 - struct super_block *sb = file_inode(vma->vm_file)->i_sb; 796 + struct file *file = vma->vm_file; 1038 797 1039 - sb_start_pagefault(sb); 1040 - file_update_time(vma->vm_file); 1041 - sb_end_pagefault(sb); 798 + /* 799 + * We pass NO_SECTOR to dax_radix_entry() because we expect that a 800 + * RADIX_DAX_PTE entry already exists in the radix tree from a 801 + * previous call to __dax_fault(). We just want to look up that PTE 802 + * entry using vmf->pgoff and make sure the dirty tag is set. This 803 + * saves us from having to make a call to get_block() here to look 804 + * up the sector. 805 + */ 806 + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); 1042 807 return VM_FAULT_NOPAGE; 1043 808 } 1044 809 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); ··· 1080 835 BUG_ON((offset + length) > PAGE_CACHE_SIZE); 1081 836 1082 837 memset(&bh, 0, sizeof(bh)); 838 + bh.b_bdev = inode->i_sb->s_bdev; 1083 839 bh.b_size = PAGE_CACHE_SIZE; 1084 840 err = get_block(inode, index, &bh, 0); 1085 841 if (err < 0)
+3 -1
fs/ext2/file.c
··· 102 102 { 103 103 struct inode *inode = file_inode(vma->vm_file); 104 104 struct ext2_inode_info *ei = EXT2_I(inode); 105 - int ret = VM_FAULT_NOPAGE; 106 105 loff_t size; 106 + int ret; 107 107 108 108 sb_start_pagefault(inode->i_sb); 109 109 file_update_time(vma->vm_file); ··· 113 113 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 114 114 if (vmf->pgoff >= size) 115 115 ret = VM_FAULT_SIGBUS; 116 + else 117 + ret = dax_pfn_mkwrite(vma, vmf); 116 118 117 119 up_read(&ei->dax_sem); 118 120 sb_end_pagefault(inode->i_sb);
+3 -1
fs/ext4/file.c
··· 291 291 { 292 292 struct inode *inode = file_inode(vma->vm_file); 293 293 struct super_block *sb = inode->i_sb; 294 - int ret = VM_FAULT_NOPAGE; 295 294 loff_t size; 295 + int ret; 296 296 297 297 sb_start_pagefault(sb); 298 298 file_update_time(vma->vm_file); ··· 300 300 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 301 301 if (vmf->pgoff >= size) 302 302 ret = VM_FAULT_SIGBUS; 303 + else 304 + ret = dax_pfn_mkwrite(vma, vmf); 303 305 up_read(&EXT4_I(inode)->i_mmap_sem); 304 306 sb_end_pagefault(sb); 305 307
+1 -1
fs/inode.c
··· 495 495 */ 496 496 spin_lock_irq(&inode->i_data.tree_lock); 497 497 BUG_ON(inode->i_data.nrpages); 498 - BUG_ON(inode->i_data.nrshadows); 498 + BUG_ON(inode->i_data.nrexceptional); 499 499 spin_unlock_irq(&inode->i_data.tree_lock); 500 500 BUG_ON(!list_empty(&inode->i_data.private_list)); 501 501 BUG_ON(!(inode->i_state & I_FREEING));
+2 -6
fs/jffs2/build.c
··· 17 17 #include <linux/slab.h> 18 18 #include <linux/vmalloc.h> 19 19 #include <linux/mtd/mtd.h> 20 + #include <linux/mm.h> /* kvfree() */ 20 21 #include "nodelist.h" 21 22 22 23 static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, ··· 384 383 return 0; 385 384 386 385 out_free: 387 - #ifndef __ECOS 388 - if (jffs2_blocks_use_vmalloc(c)) 389 - vfree(c->blocks); 390 - else 391 - #endif 392 - kfree(c->blocks); 386 + kvfree(c->blocks); 393 387 394 388 return ret; 395 389 }
+1 -4
fs/jffs2/fs.c
··· 596 596 out_root: 597 597 jffs2_free_ino_caches(c); 598 598 jffs2_free_raw_node_refs(c); 599 - if (jffs2_blocks_use_vmalloc(c)) 600 - vfree(c->blocks); 601 - else 602 - kfree(c->blocks); 599 + kvfree(c->blocks); 603 600 out_inohash: 604 601 jffs2_clear_xattr_subsystem(c); 605 602 kfree(c->inocache_list);
+1 -4
fs/jffs2/super.c
··· 331 331 332 332 jffs2_free_ino_caches(c); 333 333 jffs2_free_raw_node_refs(c); 334 - if (jffs2_blocks_use_vmalloc(c)) 335 - vfree(c->blocks); 336 - else 337 - kfree(c->blocks); 334 + kvfree(c->blocks); 338 335 jffs2_flash_cleanup(c); 339 336 kfree(c->inocache_list); 340 337 jffs2_clear_xattr_subsystem(c);
+1 -6
fs/udf/super.c
··· 279 279 { 280 280 int i; 281 281 int nr_groups = bitmap->s_nr_groups; 282 - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * 283 - nr_groups); 284 282 285 283 for (i = 0; i < nr_groups; i++) 286 284 if (bitmap->s_block_bitmap[i]) 287 285 brelse(bitmap->s_block_bitmap[i]); 288 286 289 - if (size <= PAGE_SIZE) 290 - kfree(bitmap); 291 - else 292 - vfree(bitmap); 287 + kvfree(bitmap); 293 288 } 294 289 295 290 static void udf_free_partition(struct udf_part_map *map)
+4 -3
fs/xfs/xfs_file.c
··· 1610 1610 /* 1611 1611 * pfn_mkwrite was originally inteneded to ensure we capture time stamp 1612 1612 * updates on write faults. In reality, it's need to serialise against 1613 - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() 1614 - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault 1615 - * barrier in place. 1613 + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED 1614 + * to ensure we serialise the fault barrier in place. 1616 1615 */ 1617 1616 static int 1618 1617 xfs_filemap_pfn_mkwrite( ··· 1634 1635 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1635 1636 if (vmf->pgoff >= size) 1636 1637 ret = VM_FAULT_SIGBUS; 1638 + else if (IS_DAX(inode)) 1639 + ret = dax_pfn_mkwrite(vma, vmf); 1637 1640 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1638 1641 sb_end_pagefault(inode->i_sb); 1639 1642 return ret;
+7
include/linux/dax.h
··· 36 36 { 37 37 return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); 38 38 } 39 + 40 + static inline bool dax_mapping(struct address_space *mapping) 41 + { 42 + return mapping->host && IS_DAX(mapping->host); 43 + } 44 + int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, 45 + loff_t end); 39 46 #endif
+2 -1
include/linux/fs.h
··· 433 433 struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ 434 434 /* Protected by tree_lock together with the radix tree */ 435 435 unsigned long nrpages; /* number of total pages */ 436 - unsigned long nrshadows; /* number of shadow entries */ 436 + /* number of shadow or DAX exceptional entries */ 437 + unsigned long nrexceptional; 437 438 pgoff_t writeback_index;/* writeback starts here */ 438 439 const struct address_space_operations *a_ops; /* methods */ 439 440 unsigned long flags; /* error bits/gfp mask */
+3
include/linux/pagemap.h
··· 361 361 unsigned int nr_pages, struct page **pages); 362 362 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 363 363 int tag, unsigned int nr_pages, struct page **pages); 364 + unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 365 + int tag, unsigned int nr_entries, 366 + struct page **entries, pgoff_t *indices); 364 367 365 368 struct page *grab_cache_page_write_begin(struct address_space *mapping, 366 369 pgoff_t index, unsigned flags);
+21 -1
include/linux/pmem.h
··· 53 53 { 54 54 BUG(); 55 55 } 56 + 57 + static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) 58 + { 59 + BUG(); 60 + } 56 61 #endif 57 62 58 63 /* 59 64 * Architectures that define ARCH_HAS_PMEM_API must provide 60 65 * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), 61 - * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). 66 + * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem() 67 + * and arch_has_wmb_pmem(). 62 68 */ 63 69 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) 64 70 { ··· 183 177 arch_clear_pmem(addr, size); 184 178 else 185 179 default_clear_pmem(addr, size); 180 + } 181 + 182 + /** 183 + * wb_cache_pmem - write back processor cache for PMEM memory range 184 + * @addr: virtual start address 185 + * @size: number of bytes to write back 186 + * 187 + * Write back the processor cache range starting at 'addr' for 'size' bytes. 188 + * This function requires explicit ordering with a wmb_pmem() call. 189 + */ 190 + static inline void wb_cache_pmem(void __pmem *addr, size_t size) 191 + { 192 + if (arch_has_pmem_api()) 193 + arch_wb_cache_pmem(addr, size); 186 194 } 187 195 #endif /* __PMEM_H__ */
+9
include/linux/radix-tree.h
··· 51 51 #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 52 52 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 53 53 54 + #define RADIX_DAX_MASK 0xf 55 + #define RADIX_DAX_SHIFT 4 56 + #define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) 57 + #define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) 58 + #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) 59 + #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) 60 + #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ 61 + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) 62 + 54 63 static inline int radix_tree_is_indirect_ptr(void *ptr) 55 64 { 56 65 return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
+1 -1
ipc/sem.c
··· 1493 1493 wake_up_sem_queue_do(&tasks); 1494 1494 out_free: 1495 1495 if (sem_io != fast_sem_io) 1496 - ipc_free(sem_io, sizeof(ushort)*nsems); 1496 + ipc_free(sem_io); 1497 1497 return err; 1498 1498 } 1499 1499
+3 -8
ipc/util.c
··· 414 414 /** 415 415 * ipc_free - free ipc space 416 416 * @ptr: pointer returned by ipc_alloc 417 - * @size: size of block 418 417 * 419 - * Free a block created with ipc_alloc(). The caller must know the size 420 - * used in the allocation call. 418 + * Free a block created with ipc_alloc(). 421 419 */ 422 - void ipc_free(void *ptr, int size) 420 + void ipc_free(void *ptr) 423 421 { 424 - if (size > PAGE_SIZE) 425 - vfree(ptr); 426 - else 427 - kfree(ptr); 422 + kvfree(ptr); 428 423 } 429 424 430 425 /**
+1 -1
ipc/util.h
··· 118 118 * both function can sleep 119 119 */ 120 120 void *ipc_alloc(int size); 121 - void ipc_free(void *ptr, int size); 121 + void ipc_free(void *ptr); 122 122 123 123 /* 124 124 * For allocation that need to be freed by RCU.
+85 -6
mm/filemap.c
··· 11 11 */ 12 12 #include <linux/export.h> 13 13 #include <linux/compiler.h> 14 + #include <linux/dax.h> 14 15 #include <linux/fs.h> 15 16 #include <linux/uaccess.h> 16 17 #include <linux/capability.h> ··· 124 123 __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); 125 124 126 125 if (shadow) { 127 - mapping->nrshadows++; 126 + mapping->nrexceptional++; 128 127 /* 129 - * Make sure the nrshadows update is committed before 128 + * Make sure the nrexceptional update is committed before 130 129 * the nrpages update so that final truncate racing 131 130 * with reclaim does not see both counters 0 at the 132 131 * same time and miss a shadow entry. ··· 482 481 { 483 482 int err = 0; 484 483 484 + if (dax_mapping(mapping) && mapping->nrexceptional) { 485 + err = dax_writeback_mapping_range(mapping, lstart, lend); 486 + if (err) 487 + return err; 488 + } 489 + 485 490 if (mapping->nrpages) { 486 491 err = __filemap_fdatawrite_range(mapping, lstart, lend, 487 492 WB_SYNC_ALL); ··· 586 579 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 587 580 if (!radix_tree_exceptional_entry(p)) 588 581 return -EEXIST; 582 + 583 + if (WARN_ON(dax_mapping(mapping))) 584 + return -EINVAL; 585 + 589 586 if (shadowp) 590 587 *shadowp = p; 591 - mapping->nrshadows--; 588 + mapping->nrexceptional--; 592 589 if (node) 593 590 workingset_node_shadows_dec(node); 594 591 } ··· 1256 1245 if (radix_tree_deref_retry(page)) 1257 1246 goto restart; 1258 1247 /* 1259 - * A shadow entry of a recently evicted page, 1260 - * or a swap entry from shmem/tmpfs. Return 1261 - * it without attempting to raise page count. 1248 + * A shadow entry of a recently evicted page, a swap 1249 + * entry from shmem/tmpfs or a DAX entry. Return it 1250 + * without attempting to raise page count. 
1262 1251 */ 1263 1252 goto export; 1264 1253 } ··· 1504 1493 return ret; 1505 1494 } 1506 1495 EXPORT_SYMBOL(find_get_pages_tag); 1496 + 1497 + /** 1498 + * find_get_entries_tag - find and return entries that match @tag 1499 + * @mapping: the address_space to search 1500 + * @start: the starting page cache index 1501 + * @tag: the tag index 1502 + * @nr_entries: the maximum number of entries 1503 + * @entries: where the resulting entries are placed 1504 + * @indices: the cache indices corresponding to the entries in @entries 1505 + * 1506 + * Like find_get_entries, except we only return entries which are tagged with 1507 + * @tag. 1508 + */ 1509 + unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 1510 + int tag, unsigned int nr_entries, 1511 + struct page **entries, pgoff_t *indices) 1512 + { 1513 + void **slot; 1514 + unsigned int ret = 0; 1515 + struct radix_tree_iter iter; 1516 + 1517 + if (!nr_entries) 1518 + return 0; 1519 + 1520 + rcu_read_lock(); 1521 + restart: 1522 + radix_tree_for_each_tagged(slot, &mapping->page_tree, 1523 + &iter, start, tag) { 1524 + struct page *page; 1525 + repeat: 1526 + page = radix_tree_deref_slot(slot); 1527 + if (unlikely(!page)) 1528 + continue; 1529 + if (radix_tree_exception(page)) { 1530 + if (radix_tree_deref_retry(page)) { 1531 + /* 1532 + * Transient condition which can only trigger 1533 + * when entry at index 0 moves out of or back 1534 + * to root: none yet gotten, safe to restart. 1535 + */ 1536 + goto restart; 1537 + } 1538 + 1539 + /* 1540 + * A shadow entry of a recently evicted page, a swap 1541 + * entry from shmem/tmpfs or a DAX entry. Return it 1542 + * without attempting to raise page count. 1543 + */ 1544 + goto export; 1545 + } 1546 + if (!page_cache_get_speculative(page)) 1547 + goto repeat; 1548 + 1549 + /* Has the page moved? 
*/ 1550 + if (unlikely(page != *slot)) { 1551 + page_cache_release(page); 1552 + goto repeat; 1553 + } 1554 + export: 1555 + indices[ret] = iter.index; 1556 + entries[ret] = page; 1557 + if (++ret == nr_entries) 1558 + break; 1559 + } 1560 + rcu_read_unlock(); 1561 + return ret; 1562 + } 1563 + EXPORT_SYMBOL(find_get_entries_tag); 1507 1564 1508 1565 /* 1509 1566 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+7 -11
mm/percpu.c
··· 305 305 /** 306 306 * pcpu_mem_free - free memory 307 307 * @ptr: memory to free 308 - * @size: size of the area 309 308 * 310 309 * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). 311 310 */ 312 - static void pcpu_mem_free(void *ptr, size_t size) 311 + static void pcpu_mem_free(void *ptr) 313 312 { 314 - if (size <= PAGE_SIZE) 315 - kfree(ptr); 316 - else 317 - vfree(ptr); 313 + kvfree(ptr); 318 314 } 319 315 320 316 /** ··· 459 463 * pcpu_mem_free() might end up calling vfree() which uses 460 464 * IRQ-unsafe lock and thus can't be called under pcpu_lock. 461 465 */ 462 - pcpu_mem_free(old, old_size); 463 - pcpu_mem_free(new, new_size); 466 + pcpu_mem_free(old); 467 + pcpu_mem_free(new); 464 468 465 469 return 0; 466 470 } ··· 728 732 chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * 729 733 sizeof(chunk->map[0])); 730 734 if (!chunk->map) { 731 - pcpu_mem_free(chunk, pcpu_chunk_struct_size); 735 + pcpu_mem_free(chunk); 732 736 return NULL; 733 737 } 734 738 ··· 749 753 { 750 754 if (!chunk) 751 755 return; 752 - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 753 - pcpu_mem_free(chunk, pcpu_chunk_struct_size); 756 + pcpu_mem_free(chunk->map); 757 + pcpu_mem_free(chunk); 754 758 } 755 759 756 760 /**
+39 -30
mm/truncate.c
··· 9 9 10 10 #include <linux/kernel.h> 11 11 #include <linux/backing-dev.h> 12 + #include <linux/dax.h> 12 13 #include <linux/gfp.h> 13 14 #include <linux/mm.h> 14 15 #include <linux/swap.h> ··· 35 34 return; 36 35 37 36 spin_lock_irq(&mapping->tree_lock); 38 - /* 39 - * Regular page slots are stabilized by the page lock even 40 - * without the tree itself locked. These unlocked entries 41 - * need verification under the tree lock. 42 - */ 43 - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) 44 - goto unlock; 45 - if (*slot != entry) 46 - goto unlock; 47 - radix_tree_replace_slot(slot, NULL); 48 - mapping->nrshadows--; 49 - if (!node) 50 - goto unlock; 51 - workingset_node_shadows_dec(node); 52 - /* 53 - * Don't track node without shadow entries. 54 - * 55 - * Avoid acquiring the list_lru lock if already untracked. 56 - * The list_empty() test is safe as node->private_list is 57 - * protected by mapping->tree_lock. 58 - */ 59 - if (!workingset_node_shadows(node) && 60 - !list_empty(&node->private_list)) 61 - list_lru_del(&workingset_shadow_nodes, &node->private_list); 62 - __radix_tree_delete_node(&mapping->page_tree, node); 37 + 38 + if (dax_mapping(mapping)) { 39 + if (radix_tree_delete_item(&mapping->page_tree, index, entry)) 40 + mapping->nrexceptional--; 41 + } else { 42 + /* 43 + * Regular page slots are stabilized by the page lock even 44 + * without the tree itself locked. These unlocked entries 45 + * need verification under the tree lock. 46 + */ 47 + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, 48 + &slot)) 49 + goto unlock; 50 + if (*slot != entry) 51 + goto unlock; 52 + radix_tree_replace_slot(slot, NULL); 53 + mapping->nrexceptional--; 54 + if (!node) 55 + goto unlock; 56 + workingset_node_shadows_dec(node); 57 + /* 58 + * Don't track node without shadow entries. 59 + * 60 + * Avoid acquiring the list_lru lock if already untracked. 
61 + * The list_empty() test is safe as node->private_list is 62 + * protected by mapping->tree_lock. 63 + */ 64 + if (!workingset_node_shadows(node) && 65 + !list_empty(&node->private_list)) 66 + list_lru_del(&workingset_shadow_nodes, 67 + &node->private_list); 68 + __radix_tree_delete_node(&mapping->page_tree, node); 69 + } 63 70 unlock: 64 71 spin_unlock_irq(&mapping->tree_lock); 65 72 } ··· 237 228 int i; 238 229 239 230 cleancache_invalidate_inode(mapping); 240 - if (mapping->nrpages == 0 && mapping->nrshadows == 0) 231 + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) 241 232 return; 242 233 243 234 /* Offsets within partial pages */ ··· 411 402 */ 412 403 void truncate_inode_pages_final(struct address_space *mapping) 413 404 { 414 - unsigned long nrshadows; 405 + unsigned long nrexceptional; 415 406 unsigned long nrpages; 416 407 417 408 /* ··· 425 416 426 417 /* 427 418 * When reclaim installs eviction entries, it increases 428 - * nrshadows first, then decreases nrpages. Make sure we see 419 + * nrexceptional first, then decreases nrpages. Make sure we see 429 420 * this in the right order or we might miss an entry. 430 421 */ 431 422 nrpages = mapping->nrpages; 432 423 smp_rmb(); 433 - nrshadows = mapping->nrshadows; 424 + nrexceptional = mapping->nrexceptional; 434 425 435 - if (nrpages || nrshadows) { 426 + if (nrpages || nrexceptional) { 436 427 /* 437 428 * As truncation uses a lockless tree lookup, cycle 438 429 * the tree lock to make sure any ongoing tree
+8 -1
mm/vmscan.c
··· 46 46 #include <linux/oom.h> 47 47 #include <linux/prefetch.h> 48 48 #include <linux/printk.h> 49 + #include <linux/dax.h> 49 50 50 51 #include <asm/tlbflush.h> 51 52 #include <asm/div64.h> ··· 672 671 * inode reclaim needs to empty out the radix tree or 673 672 * the nodes are lost. Don't plant shadows behind its 674 673 * back. 674 + * 675 + * We also don't store shadows for DAX mappings because the 676 + * only page cache pages found in these are zero pages 677 + * covering holes, and because we don't want to mix DAX 678 + * exceptional entries and shadow exceptional entries in the 679 + * same page_tree. 675 680 */ 676 681 if (reclaimed && page_is_file_cache(page) && 677 - !mapping_exiting(mapping)) 682 + !mapping_exiting(mapping) && !dax_mapping(mapping)) 678 683 shadow = workingset_eviction(mapping, page); 679 684 __delete_from_page_cache(page, shadow, memcg); 680 685 spin_unlock_irqrestore(&mapping->tree_lock, flags);
+2 -2
mm/workingset.c
··· 351 351 node->slots[i] = NULL; 352 352 BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); 353 353 node->count -= 1U << RADIX_TREE_COUNT_SHIFT; 354 - BUG_ON(!mapping->nrshadows); 355 - mapping->nrshadows--; 354 + BUG_ON(!mapping->nrexceptional); 355 + mapping->nrexceptional--; 356 356 } 357 357 } 358 358 BUG_ON(node->count);
+1 -3
net/ipv4/fib_trie.c
··· 289 289 290 290 if (!n->tn_bits) 291 291 kmem_cache_free(trie_leaf_kmem, n); 292 - else if (n->tn_bits <= TNODE_KMALLOC_MAX) 293 - kfree(n); 294 292 else 295 - vfree(n); 293 + kvfree(n); 296 294 } 297 295 298 296 #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)