Merge branch 'akpm' (patches from Andrew)

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge small final update from Andrew Morton:

- DAX feature work: add fsync/msync support

- kfree cleanup, MAINTAINERS update

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
MAINTAINERS: return arch/sh to maintained state, with new maintainers
tree wide: use kvfree() rather than conditional kfree()/vfree()
dax: never rely on bh.b_dev being set by get_block()
xfs: call dax_pfn_mkwrite() for DAX fsync/msync
ext4: call dax_pfn_mkwrite() for DAX fsync/msync
ext2: call dax_pfn_mkwrite() for DAX fsync/msync
dax: add support for fsync/sync
mm: add find_get_entries_tag()
dax: support dirty DAX entries in radix tree
pmem: add wb_cache_pmem() to the PMEM API
dax: fix conversion of holes to PMDs
dax: fix NULL pointer dereference in __dax_dbg()

Linus Torvalds 10 years ago 20c759ca b82dde02

+493 -171

34 changed files

expand all collapse all

MAINTAINERS

arch

arm

dma-mapping.c

x86

include

asm

pmem.h

drivers

acpi

apei

erst.c

block

drbd

drbd_bitmap.c

drbd_int.h

char

mspec.c

gpu

drm

drm_hashtab.c

staging

lustre

include

linux

libcfs

libcfs_private.h

block_dev.c

coda

coda_linux.h

dax.c

ext2

file.c

ext4

file.c

inode.c

jffs2

build.c

fs.c

super.c

udf

super.c

xfs

xfs_file.c

include

linux

dax.h

fs.h

pagemap.h

pmem.h

radix-tree.h

ipc

sem.c

util.c

util.h

filemap.c

percpu.c

truncate.c

vmscan.c

workingset.c

net

ipv4

fib_trie.c

+3 -1

MAINTAINERS

reviewed

··· 10453 10453 F: drivers/net/ethernet/dlink/sundance.c 10454 10454 10455 10455 SUPERH 10456 10456 + M: Yoshinori Sato <ysato@users.sourceforge.jp> 10457 10457 + M: Rich Felker <dalias@libc.org> 10456 10458 L: linux-sh@vger.kernel.org 10457 10459 Q: http://patchwork.kernel.org/project/linux-sh/list/ 10458 10458 - S: Orphan 10460 10460 + S: Maintained 10459 10461 F: Documentation/sh/ 10460 10462 F: arch/sh/ 10461 10463 F: drivers/sh/

+2 -9

arch/arm/mm/dma-mapping.c

reviewed

··· 1200 1200 while (i--) 1201 1201 if (pages[i]) 1202 1202 __free_pages(pages[i], 0); 1203 1203 - if (array_size <= PAGE_SIZE) 1204 1204 - kfree(pages); 1205 1205 - else 1206 1206 - vfree(pages); 1203 1203 + kvfree(pages); 1207 1204 return NULL; 1208 1205 } 1209 1206 ··· 1208 1211 size_t size, struct dma_attrs *attrs) 1209 1212 { 1210 1213 int count = size >> PAGE_SHIFT; 1211 1211 - int array_size = count * sizeof(struct page *); 1212 1214 int i; 1213 1215 1214 1216 if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) { ··· 1218 1222 __free_pages(pages[i], 0); 1219 1223 } 1220 1224 1221 1221 - if (array_size <= PAGE_SIZE) 1222 1222 - kfree(pages); 1223 1223 - else 1224 1224 - vfree(pages); 1225 1225 + kvfree(pages); 1225 1226 return 0; 1226 1227 } 1227 1228

+6 -5

arch/x86/include/asm/pmem.h

reviewed

··· 67 67 } 68 68 69 69 /** 70 70 - * __arch_wb_cache_pmem - write back a cache range with CLWB 70 70 + * arch_wb_cache_pmem - write back a cache range with CLWB 71 71 * @vaddr: virtual start address 72 72 * @size: number of bytes to write back 73 73 * 74 74 * Write back a cache range using the CLWB (cache line write back) 75 75 * instruction. This function requires explicit ordering with an 76 76 - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. 76 76 + * arch_wmb_pmem() call. 77 77 */ 78 78 - static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) 78 78 + static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) 79 79 { 80 80 u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; 81 81 unsigned long clflush_mask = x86_clflush_size - 1; 82 82 + void *vaddr = (void __force *)addr; 82 83 void *vend = vaddr + size; 83 84 void *p; 84 85 ··· 116 115 len = copy_from_iter_nocache(vaddr, bytes, i); 117 116 118 117 if (__iter_needs_pmem_wb(i)) 119 119 - __arch_wb_cache_pmem(vaddr, bytes); 118 118 + arch_wb_cache_pmem(addr, bytes); 120 119 121 120 return len; 122 121 } ··· 134 133 void *vaddr = (void __force *)addr; 135 134 136 135 memset(vaddr, 0, size); 137 137 - __arch_wb_cache_pmem(vaddr, size); 136 136 + arch_wb_cache_pmem(addr, size); 138 137 } 139 138 140 139 static inline bool __arch_has_wmb_pmem(void)

+2 -4

drivers/acpi/apei/erst.c

reviewed

··· 32 32 #include <linux/hardirq.h> 33 33 #include <linux/pstore.h> 34 34 #include <linux/vmalloc.h> 35 35 + #include <linux/mm.h> /* kvfree() */ 35 36 #include <acpi/apei.h> 36 37 37 38 #include "apei-internal.h" ··· 533 532 return -ENOMEM; 534 533 memcpy(new_entries, entries, 535 534 erst_record_id_cache.len * sizeof(entries[0])); 536 536 - if (erst_record_id_cache.size < PAGE_SIZE) 537 537 - kfree(entries); 538 538 - else 539 539 - vfree(entries); 535 535 + kvfree(entries); 540 536 erst_record_id_cache.entries = entries = new_entries; 541 537 erst_record_id_cache.size = new_size; 542 538 }

+7 -19

drivers/block/drbd/drbd_bitmap.c

reviewed

··· 364 364 } 365 365 } 366 366 367 367 - static void bm_vk_free(void *ptr, int v) 367 367 + static inline void bm_vk_free(void *ptr) 368 368 { 369 369 - if (v) 370 370 - vfree(ptr); 371 371 - else 372 372 - kfree(ptr); 369 369 + kvfree(ptr); 373 370 } 374 371 375 372 /* ··· 376 379 { 377 380 struct page **old_pages = b->bm_pages; 378 381 struct page **new_pages, *page; 379 379 - unsigned int i, bytes, vmalloced = 0; 382 382 + unsigned int i, bytes; 380 383 unsigned long have = b->bm_number_of_pages; 381 384 382 385 BUG_ON(have == 0 && old_pages != NULL); ··· 398 401 PAGE_KERNEL); 399 402 if (!new_pages) 400 403 return NULL; 401 401 - vmalloced = 1; 402 404 } 403 405 404 406 if (want >= have) { ··· 407 411 page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); 408 412 if (!page) { 409 413 bm_free_pages(new_pages + have, i - have); 410 410 - bm_vk_free(new_pages, vmalloced); 414 414 + bm_vk_free(new_pages); 411 415 return NULL; 412 416 } 413 417 /* we want to know which page it is ··· 422 426 bm_free_pages(old_pages + want, have - want); 423 427 */ 424 428 } 425 425 - 426 426 - if (vmalloced) 427 427 - b->bm_flags |= BM_P_VMALLOCED; 428 428 - else 429 429 - b->bm_flags &= ~BM_P_VMALLOCED; 430 429 431 430 return new_pages; 432 431 } ··· 460 469 if (!expect(device->bitmap)) 461 470 return; 462 471 bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); 463 463 - bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); 472 472 + bm_vk_free(device->bitmap->bm_pages); 464 473 kfree(device->bitmap); 465 474 device->bitmap = NULL; 466 475 } ··· 634 643 unsigned long want, have, onpages; /* number of pages */ 635 644 struct page **npages, **opages = NULL; 636 645 int err = 0, growing; 637 637 - int opages_vmalloced; 638 646 639 647 if (!expect(b)) 640 648 return -ENOMEM; ··· 645 655 646 656 if (capacity == b->bm_dev_capacity) 647 657 goto out; 648 648 - 649 649 - opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); 650 658 651 659 if 
(capacity == 0) { 652 660 spin_lock_irq(&b->bm_lock); ··· 659 671 b->bm_dev_capacity = 0; 660 672 spin_unlock_irq(&b->bm_lock); 661 673 bm_free_pages(opages, onpages); 662 662 - bm_vk_free(opages, opages_vmalloced); 674 674 + bm_vk_free(opages); 663 675 goto out; 664 676 } 665 677 bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); ··· 732 744 733 745 spin_unlock_irq(&b->bm_lock); 734 746 if (opages != npages) 735 735 - bm_vk_free(opages, opages_vmalloced); 747 747 + bm_vk_free(opages); 736 748 if (!growing) 737 749 b->bm_set = bm_count_bits(b); 738 750 drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);

-3

drivers/block/drbd/drbd_int.h

reviewed

··· 536 536 /* definition of bits in bm_flags to be used in drbd_bm_lock 537 537 * and drbd_bitmap_io and friends. */ 538 538 enum bm_flag { 539 539 - /* do we need to kfree, or vfree bm_pages? */ 540 540 - BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ 541 541 - 542 539 /* currently locked for bulk operation */ 543 540 BM_LOCKED_MASK = 0xf, 544 541

+3 -12

drivers/char/mspec.c

reviewed

··· 93 93 spinlock_t lock; /* Serialize access to this structure. */ 94 94 int count; /* Number of pages allocated. */ 95 95 enum mspec_page_type type; /* Type of pages allocated. */ 96 96 - int flags; /* See VMD_xxx below. */ 97 96 unsigned long vm_start; /* Original (unsplit) base. */ 98 97 unsigned long vm_end; /* Original (unsplit) end. */ 99 98 unsigned long maddr[0]; /* Array of MSPEC addresses. */ 100 99 }; 101 101 - 102 102 - #define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ 103 100 104 101 /* used on shub2 to clear FOP cache in the HUB */ 105 102 static unsigned long scratch_page[MAX_NUMNODES]; ··· 182 185 "failed to zero page %ld\n", my_page); 183 186 } 184 187 185 185 - if (vdata->flags & VMD_VMALLOCED) 186 186 - vfree(vdata); 187 187 - else 188 188 - kfree(vdata); 188 188 + kvfree(vdata); 189 189 } 190 190 191 191 /* ··· 250 256 enum mspec_page_type type) 251 257 { 252 258 struct vma_data *vdata; 253 253 - int pages, vdata_size, flags = 0; 259 259 + int pages, vdata_size; 254 260 255 261 if (vma->vm_pgoff != 0) 256 262 return -EINVAL; ··· 265 271 vdata_size = sizeof(struct vma_data) + pages * sizeof(long); 266 272 if (vdata_size <= PAGE_SIZE) 267 273 vdata = kzalloc(vdata_size, GFP_KERNEL); 268 268 - else { 274 274 + else 269 275 vdata = vzalloc(vdata_size); 270 270 - flags = VMD_VMALLOCED; 271 271 - } 272 276 if (!vdata) 273 277 return -ENOMEM; 274 278 275 279 vdata->vm_start = vma->vm_start; 276 280 vdata->vm_end = vma->vm_end; 277 277 - vdata->flags = flags; 278 281 vdata->type = type; 279 282 spin_lock_init(&vdata->lock); 280 283 atomic_set(&vdata->refcnt, 1);

+1 -4

drivers/gpu/drm/drm_hashtab.c

reviewed

··· 198 198 void drm_ht_remove(struct drm_open_hash *ht) 199 199 { 200 200 if (ht->table) { 201 201 - if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order) 202 202 - kfree(ht->table); 203 203 - else 204 204 - vfree(ht->table); 201 201 + kvfree(ht->table); 205 202 ht->table = NULL; 206 203 } 207 204 }

+2 -6

drivers/staging/lustre/include/linux/libcfs/libcfs_private.h

reviewed

··· 151 151 152 152 #define LIBCFS_FREE(ptr, size) \ 153 153 do { \ 154 154 - int s = (size); \ 155 154 if (unlikely((ptr) == NULL)) { \ 156 155 CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ 157 157 - "%s:%d\n", s, __FILE__, __LINE__); \ 156 156 + "%s:%d\n", (int)(size), __FILE__, __LINE__); \ 158 157 break; \ 159 158 } \ 160 160 - if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ 161 161 - vfree(ptr); \ 162 162 - else \ 163 163 - kfree(ptr); \ 159 159 + kvfree(ptr); \ 164 160 } while (0) 165 161 166 162 /******************************************************************************/

+1 -1

fs/block_dev.c

reviewed

··· 75 75 { 76 76 struct address_space *mapping = bdev->bd_inode->i_mapping; 77 77 78 78 - if (mapping->nrpages == 0 && mapping->nrshadows == 0) 78 78 + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) 79 79 return; 80 80 81 81 invalidate_bh_lrus();

+1 -2

fs/coda/coda_linux.h

reviewed

··· 72 72 } while (0) 73 73 74 74 75 75 - #define CODA_FREE(ptr,size) \ 76 76 - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) 75 75 + #define CODA_FREE(ptr, size) kvfree((ptr)) 77 76 78 77 /* inode to cnode access functions */ 79 78

+260 -14

fs/dax.c

reviewed

··· 24 24 #include <linux/memcontrol.h> 25 25 #include <linux/mm.h> 26 26 #include <linux/mutex.h> 27 27 + #include <linux/pagevec.h> 27 28 #include <linux/pmem.h> 28 29 #include <linux/sched.h> 29 30 #include <linux/uio.h> ··· 246 245 loff_t end = pos + iov_iter_count(iter); 247 246 248 247 memset(&bh, 0, sizeof(bh)); 248 248 + bh.b_bdev = inode->i_sb->s_bdev; 249 249 250 250 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { 251 251 struct address_space *mapping = inode->i_mapping; ··· 326 324 return 0; 327 325 } 328 326 327 327 + #define NO_SECTOR -1 328 328 + #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) 329 329 + 330 330 + static int dax_radix_entry(struct address_space *mapping, pgoff_t index, 331 331 + sector_t sector, bool pmd_entry, bool dirty) 332 332 + { 333 333 + struct radix_tree_root *page_tree = &mapping->page_tree; 334 334 + pgoff_t pmd_index = DAX_PMD_INDEX(index); 335 335 + int type, error = 0; 336 336 + void *entry; 337 337 + 338 338 + WARN_ON_ONCE(pmd_entry && !dirty); 339 339 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 340 340 + 341 341 + spin_lock_irq(&mapping->tree_lock); 342 342 + 343 343 + entry = radix_tree_lookup(page_tree, pmd_index); 344 344 + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { 345 345 + index = pmd_index; 346 346 + goto dirty; 347 347 + } 348 348 + 349 349 + entry = radix_tree_lookup(page_tree, index); 350 350 + if (entry) { 351 351 + type = RADIX_DAX_TYPE(entry); 352 352 + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && 353 353 + type != RADIX_DAX_PMD)) { 354 354 + error = -EIO; 355 355 + goto unlock; 356 356 + } 357 357 + 358 358 + if (!pmd_entry || type == RADIX_DAX_PMD) 359 359 + goto dirty; 360 360 + 361 361 + /* 362 362 + * We only insert dirty PMD entries into the radix tree. 
This 363 363 + * means we don't need to worry about removing a dirty PTE 364 364 + * entry and inserting a clean PMD entry, thus reducing the 365 365 + * range we would flush with a follow-up fsync/msync call. 366 366 + */ 367 367 + radix_tree_delete(&mapping->page_tree, index); 368 368 + mapping->nrexceptional--; 369 369 + } 370 370 + 371 371 + if (sector == NO_SECTOR) { 372 372 + /* 373 373 + * This can happen during correct operation if our pfn_mkwrite 374 374 + * fault raced against a hole punch operation. If this 375 375 + * happens the pte that was hole punched will have been 376 376 + * unmapped and the radix tree entry will have been removed by 377 377 + * the time we are called, but the call will still happen. We 378 378 + * will return all the way up to wp_pfn_shared(), where the 379 379 + * pte_same() check will fail, eventually causing page fault 380 380 + * to be retried by the CPU. 381 381 + */ 382 382 + goto unlock; 383 383 + } 384 384 + 385 385 + error = radix_tree_insert(page_tree, index, 386 386 + RADIX_DAX_ENTRY(sector, pmd_entry)); 387 387 + if (error) 388 388 + goto unlock; 389 389 + 390 390 + mapping->nrexceptional++; 391 391 + dirty: 392 392 + if (dirty) 393 393 + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 394 394 + unlock: 395 395 + spin_unlock_irq(&mapping->tree_lock); 396 396 + return error; 397 397 + } 398 398 + 399 399 + static int dax_writeback_one(struct block_device *bdev, 400 400 + struct address_space *mapping, pgoff_t index, void *entry) 401 401 + { 402 402 + struct radix_tree_root *page_tree = &mapping->page_tree; 403 403 + int type = RADIX_DAX_TYPE(entry); 404 404 + struct radix_tree_node *node; 405 405 + struct blk_dax_ctl dax; 406 406 + void **slot; 407 407 + int ret = 0; 408 408 + 409 409 + spin_lock_irq(&mapping->tree_lock); 410 410 + /* 411 411 + * Regular page slots are stabilized by the page lock even 412 412 + * without the tree itself locked. 
These unlocked entries 413 413 + * need verification under the tree lock. 414 414 + */ 415 415 + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) 416 416 + goto unlock; 417 417 + if (*slot != entry) 418 418 + goto unlock; 419 419 + 420 420 + /* another fsync thread may have already written back this entry */ 421 421 + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 422 422 + goto unlock; 423 423 + 424 424 + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { 425 425 + ret = -EIO; 426 426 + goto unlock; 427 427 + } 428 428 + 429 429 + dax.sector = RADIX_DAX_SECTOR(entry); 430 430 + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); 431 431 + spin_unlock_irq(&mapping->tree_lock); 432 432 + 433 433 + /* 434 434 + * We cannot hold tree_lock while calling dax_map_atomic() because it 435 435 + * eventually calls cond_resched(). 436 436 + */ 437 437 + ret = dax_map_atomic(bdev, &dax); 438 438 + if (ret < 0) 439 439 + return ret; 440 440 + 441 441 + if (WARN_ON_ONCE(ret < dax.size)) { 442 442 + ret = -EIO; 443 443 + goto unmap; 444 444 + } 445 445 + 446 446 + wb_cache_pmem(dax.addr, dax.size); 447 447 + 448 448 + spin_lock_irq(&mapping->tree_lock); 449 449 + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 450 450 + spin_unlock_irq(&mapping->tree_lock); 451 451 + unmap: 452 452 + dax_unmap_atomic(bdev, &dax); 453 453 + return ret; 454 454 + 455 455 + unlock: 456 456 + spin_unlock_irq(&mapping->tree_lock); 457 457 + return ret; 458 458 + } 459 459 + 460 460 + /* 461 461 + * Flush the mapping to the persistent domain within the byte range of [start, 462 462 + * end]. This is required by data integrity operations to ensure file data is 463 463 + * on persistent storage prior to completion of the operation. 
464 464 + */ 465 465 + int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, 466 466 + loff_t end) 467 467 + { 468 468 + struct inode *inode = mapping->host; 469 469 + struct block_device *bdev = inode->i_sb->s_bdev; 470 470 + pgoff_t start_index, end_index, pmd_index; 471 471 + pgoff_t indices[PAGEVEC_SIZE]; 472 472 + struct pagevec pvec; 473 473 + bool done = false; 474 474 + int i, ret = 0; 475 475 + void *entry; 476 476 + 477 477 + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 478 478 + return -EIO; 479 479 + 480 480 + start_index = start >> PAGE_CACHE_SHIFT; 481 481 + end_index = end >> PAGE_CACHE_SHIFT; 482 482 + pmd_index = DAX_PMD_INDEX(start_index); 483 483 + 484 484 + rcu_read_lock(); 485 485 + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); 486 486 + rcu_read_unlock(); 487 487 + 488 488 + /* see if the start of our range is covered by a PMD entry */ 489 489 + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) 490 490 + start_index = pmd_index; 491 491 + 492 492 + tag_pages_for_writeback(mapping, start_index, end_index); 493 493 + 494 494 + pagevec_init(&pvec, 0); 495 495 + while (!done) { 496 496 + pvec.nr = find_get_entries_tag(mapping, start_index, 497 497 + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 498 498 + pvec.pages, indices); 499 499 + 500 500 + if (pvec.nr == 0) 501 501 + break; 502 502 + 503 503 + for (i = 0; i < pvec.nr; i++) { 504 504 + if (indices[i] > end_index) { 505 505 + done = true; 506 506 + break; 507 507 + } 508 508 + 509 509 + ret = dax_writeback_one(bdev, mapping, indices[i], 510 510 + pvec.pages[i]); 511 511 + if (ret < 0) 512 512 + return ret; 513 513 + } 514 514 + } 515 515 + wmb_pmem(); 516 516 + return 0; 517 517 + } 518 518 + EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 519 519 + 329 520 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, 330 521 struct vm_area_struct *vma, struct vm_fault *vmf) 331 522 { ··· 557 362 wmb_pmem(); 558 363 } 559 364 
dax_unmap_atomic(bdev, &dax); 365 365 + 366 366 + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, 367 367 + vmf->flags & FAULT_FLAG_WRITE); 368 368 + if (error) 369 369 + goto out; 560 370 561 371 error = vm_insert_mixed(vma, vaddr, dax.pfn); 562 372 ··· 608 408 609 409 memset(&bh, 0, sizeof(bh)); 610 410 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); 411 411 + bh.b_bdev = inode->i_sb->s_bdev; 611 412 bh.b_size = PAGE_SIZE; 612 413 613 414 repeat: ··· 688 487 delete_from_page_cache(page); 689 488 unlock_page(page); 690 489 page_cache_release(page); 490 490 + page = NULL; 691 491 } 692 492 693 493 /* ··· 792 590 struct block_device *bdev; 793 591 pgoff_t size, pgoff; 794 592 sector_t block; 795 795 - int result = 0; 593 593 + int error, result = 0; 594 594 + bool alloc = false; 796 595 797 596 /* dax pmd mappings require pfn_t_devmap() */ 798 597 if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) ··· 827 624 } 828 625 829 626 memset(&bh, 0, sizeof(bh)); 627 627 + bh.b_bdev = inode->i_sb->s_bdev; 830 628 block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); 831 629 832 630 bh.b_size = PMD_SIZE; 833 833 - if (get_block(inode, block, &bh, write) != 0) 631 631 + 632 632 + if (get_block(inode, block, &bh, 0) != 0) 834 633 return VM_FAULT_SIGBUS; 634 634 + 635 635 + if (!buffer_mapped(&bh) && write) { 636 636 + if (get_block(inode, block, &bh, 1) != 0) 637 637 + return VM_FAULT_SIGBUS; 638 638 + alloc = true; 639 639 + } 640 640 + 835 641 bdev = bh.b_bdev; 836 836 - i_mmap_lock_read(mapping); 837 642 838 643 /* 839 644 * If the filesystem isn't willing to tell us the length of a hole, ··· 850 639 */ 851 640 if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { 852 641 dax_pmd_dbg(&bh, address, "allocated block too small"); 853 853 - goto fallback; 642 642 + return VM_FAULT_FALLBACK; 854 643 } 855 644 856 645 /* 857 646 * If we allocated new storage, make sure no process has any 858 647 * zero pages covering this hole 859 648 */ 860 860 - if (buffer_new(&bh)) { 861 
861 - i_mmap_unlock_read(mapping); 862 862 - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); 863 863 - i_mmap_lock_read(mapping); 649 649 + if (alloc) { 650 650 + loff_t lstart = pgoff << PAGE_SHIFT; 651 651 + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ 652 652 + 653 653 + truncate_pagecache_range(inode, lstart, lend); 864 654 } 655 655 + 656 656 + i_mmap_lock_read(mapping); 865 657 866 658 /* 867 659 * If a truncate happened while we were allocating blocks, we may ··· 878 664 goto out; 879 665 } 880 666 if ((pgoff | PG_PMD_COLOUR) >= size) { 881 881 - dax_pmd_dbg(&bh, address, "pgoff unaligned"); 667 667 + dax_pmd_dbg(&bh, address, 668 668 + "offset + huge page size > file size"); 882 669 goto fallback; 883 670 } 884 671 ··· 947 732 } 948 733 dax_unmap_atomic(bdev, &dax); 949 734 735 735 + /* 736 736 + * For PTE faults we insert a radix tree entry for reads, and 737 737 + * leave it clean. Then on the first write we dirty the radix 738 738 + * tree entry via the dax_pfn_mkwrite() path. This sequence 739 739 + * allows the dax_pfn_mkwrite() call to be simpler and avoid a 740 740 + * call into get_block() to translate the pgoff to a sector in 741 741 + * order to be able to create a new radix tree entry. 742 742 + * 743 743 + * The PMD path doesn't have an equivalent to 744 744 + * dax_pfn_mkwrite(), though, so for a read followed by a 745 745 + * write we traverse all the way through __dax_pmd_fault() 746 746 + * twice. This means we can just skip inserting a radix tree 747 747 + * entry completely on the initial read and just wait until 748 748 + * the write to insert a dirty entry. 
749 749 + */ 750 750 + if (write) { 751 751 + error = dax_radix_entry(mapping, pgoff, dax.sector, 752 752 + true, true); 753 753 + if (error) { 754 754 + dax_pmd_dbg(&bh, address, 755 755 + "PMD radix insertion failed"); 756 756 + goto fallback; 757 757 + } 758 758 + } 759 759 + 950 760 dev_dbg(part_to_dev(bdev->bd_part), 951 761 "%s: %s addr: %lx pfn: %lx sect: %llx\n", 952 762 __func__, current->comm, address, ··· 1030 790 * dax_pfn_mkwrite - handle first write to DAX page 1031 791 * @vma: The virtual memory area where the fault occurred 1032 792 * @vmf: The description of the fault 1033 1033 - * 1034 793 */ 1035 794 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1036 795 { 1037 1037 - struct super_block *sb = file_inode(vma->vm_file)->i_sb; 796 796 + struct file *file = vma->vm_file; 1038 797 1039 1039 - sb_start_pagefault(sb); 1040 1040 - file_update_time(vma->vm_file); 1041 1041 - sb_end_pagefault(sb); 798 798 + /* 799 799 + * We pass NO_SECTOR to dax_radix_entry() because we expect that a 800 800 + * RADIX_DAX_PTE entry already exists in the radix tree from a 801 801 + * previous call to __dax_fault(). We just want to look up that PTE 802 802 + * entry using vmf->pgoff and make sure the dirty tag is set. This 803 803 + * saves us from having to make a call to get_block() here to look 804 804 + * up the sector. 805 805 + */ 806 806 + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); 1042 807 return VM_FAULT_NOPAGE; 1043 808 } 1044 809 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); ··· 1080 835 BUG_ON((offset + length) > PAGE_CACHE_SIZE); 1081 836 1082 837 memset(&bh, 0, sizeof(bh)); 838 838 + bh.b_bdev = inode->i_sb->s_bdev; 1083 839 bh.b_size = PAGE_CACHE_SIZE; 1084 840 err = get_block(inode, index, &bh, 0); 1085 841 if (err < 0)

+3 -1

fs/ext2/file.c

reviewed

··· 102 102 { 103 103 struct inode *inode = file_inode(vma->vm_file); 104 104 struct ext2_inode_info *ei = EXT2_I(inode); 105 105 - int ret = VM_FAULT_NOPAGE; 106 105 loff_t size; 106 106 + int ret; 107 107 108 108 sb_start_pagefault(inode->i_sb); 109 109 file_update_time(vma->vm_file); ··· 113 113 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 114 114 if (vmf->pgoff >= size) 115 115 ret = VM_FAULT_SIGBUS; 116 116 + else 117 117 + ret = dax_pfn_mkwrite(vma, vmf); 116 118 117 119 up_read(&ei->dax_sem); 118 120 sb_end_pagefault(inode->i_sb);

+3 -1

fs/ext4/file.c

reviewed

··· 291 291 { 292 292 struct inode *inode = file_inode(vma->vm_file); 293 293 struct super_block *sb = inode->i_sb; 294 294 - int ret = VM_FAULT_NOPAGE; 295 294 loff_t size; 295 295 + int ret; 296 296 297 297 sb_start_pagefault(sb); 298 298 file_update_time(vma->vm_file); ··· 300 300 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 301 301 if (vmf->pgoff >= size) 302 302 ret = VM_FAULT_SIGBUS; 303 303 + else 304 304 + ret = dax_pfn_mkwrite(vma, vmf); 303 305 up_read(&EXT4_I(inode)->i_mmap_sem); 304 306 sb_end_pagefault(sb); 305 307

+1 -1

fs/inode.c

reviewed

··· 495 495 */ 496 496 spin_lock_irq(&inode->i_data.tree_lock); 497 497 BUG_ON(inode->i_data.nrpages); 498 498 - BUG_ON(inode->i_data.nrshadows); 498 498 + BUG_ON(inode->i_data.nrexceptional); 499 499 spin_unlock_irq(&inode->i_data.tree_lock); 500 500 BUG_ON(!list_empty(&inode->i_data.private_list)); 501 501 BUG_ON(!(inode->i_state & I_FREEING));

+2 -6

fs/jffs2/build.c

reviewed

··· 17 17 #include <linux/slab.h> 18 18 #include <linux/vmalloc.h> 19 19 #include <linux/mtd/mtd.h> 20 20 + #include <linux/mm.h> /* kvfree() */ 20 21 #include "nodelist.h" 21 22 22 23 static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, ··· 384 383 return 0; 385 384 386 385 out_free: 387 387 - #ifndef __ECOS 388 388 - if (jffs2_blocks_use_vmalloc(c)) 389 389 - vfree(c->blocks); 390 390 - else 391 391 - #endif 392 392 - kfree(c->blocks); 386 386 + kvfree(c->blocks); 393 387 394 388 return ret; 395 389 }

+1 -4

fs/jffs2/fs.c

reviewed

··· 596 596 out_root: 597 597 jffs2_free_ino_caches(c); 598 598 jffs2_free_raw_node_refs(c); 599 599 - if (jffs2_blocks_use_vmalloc(c)) 600 600 - vfree(c->blocks); 601 601 - else 602 602 - kfree(c->blocks); 599 599 + kvfree(c->blocks); 603 600 out_inohash: 604 601 jffs2_clear_xattr_subsystem(c); 605 602 kfree(c->inocache_list);

+1 -4

fs/jffs2/super.c

reviewed

··· 331 331 332 332 jffs2_free_ino_caches(c); 333 333 jffs2_free_raw_node_refs(c); 334 334 - if (jffs2_blocks_use_vmalloc(c)) 335 335 - vfree(c->blocks); 336 336 - else 337 337 - kfree(c->blocks); 334 334 + kvfree(c->blocks); 338 335 jffs2_flash_cleanup(c); 339 336 kfree(c->inocache_list); 340 337 jffs2_clear_xattr_subsystem(c);

+1 -6

fs/udf/super.c

reviewed

··· 279 279 { 280 280 int i; 281 281 int nr_groups = bitmap->s_nr_groups; 282 282 - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * 283 283 - nr_groups); 284 282 285 283 for (i = 0; i < nr_groups; i++) 286 284 if (bitmap->s_block_bitmap[i]) 287 285 brelse(bitmap->s_block_bitmap[i]); 288 286 289 289 - if (size <= PAGE_SIZE) 290 290 - kfree(bitmap); 291 291 - else 292 292 - vfree(bitmap); 287 287 + kvfree(bitmap); 293 288 } 294 289 295 290 static void udf_free_partition(struct udf_part_map *map)

+4 -3

fs/xfs/xfs_file.c

reviewed

··· 1610 1610 /* 1611 1611 * pfn_mkwrite was originally inteneded to ensure we capture time stamp 1612 1612 * updates on write faults. In reality, it's need to serialise against 1613 1613 - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() 1614 1614 - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault 1615 1615 - * barrier in place. 1613 1613 + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED 1614 1614 + * to ensure we serialise the fault barrier in place. 1616 1615 */ 1617 1616 static int 1618 1617 xfs_filemap_pfn_mkwrite( ··· 1634 1635 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1635 1636 if (vmf->pgoff >= size) 1636 1637 ret = VM_FAULT_SIGBUS; 1638 1638 + else if (IS_DAX(inode)) 1639 1639 + ret = dax_pfn_mkwrite(vma, vmf); 1637 1640 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); 1638 1641 sb_end_pagefault(inode->i_sb); 1639 1642 return ret;

include/linux/dax.h

reviewed

··· 36 36 { 37 37 return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); 38 38 } 39 39 + 40 40 + static inline bool dax_mapping(struct address_space *mapping) 41 41 + { 42 42 + return mapping->host && IS_DAX(mapping->host); 43 43 + } 44 44 + int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, 45 45 + loff_t end); 39 46 #endif

+2 -1

include/linux/fs.h

reviewed

··· 433 433 struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ 434 434 /* Protected by tree_lock together with the radix tree */ 435 435 unsigned long nrpages; /* number of total pages */ 436 436 - unsigned long nrshadows; /* number of shadow entries */ 436 436 + /* number of shadow or DAX exceptional entries */ 437 437 + unsigned long nrexceptional; 437 438 pgoff_t writeback_index;/* writeback starts here */ 438 439 const struct address_space_operations *a_ops; /* methods */ 439 440 unsigned long flags; /* error bits/gfp mask */

include/linux/pagemap.h

reviewed

··· 361 361 unsigned int nr_pages, struct page **pages); 362 362 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 363 363 int tag, unsigned int nr_pages, struct page **pages); 364 364 + unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, 365 365 + int tag, unsigned int nr_entries, 366 366 + struct page **entries, pgoff_t *indices); 364 367 365 368 struct page *grab_cache_page_write_begin(struct address_space *mapping, 366 369 pgoff_t index, unsigned flags);

+21 -1

include/linux/pmem.h

reviewed

··· 53 53 { 54 54 BUG(); 55 55 } 56 56 + 57 57 + static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) 58 58 + { 59 59 + BUG(); 60 60 + } 56 61 #endif 57 62 58 63 /* 59 64 * Architectures that define ARCH_HAS_PMEM_API must provide 60 65 * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), 61 61 - * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). 66 66 + * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem() 67 67 + * and arch_has_wmb_pmem(). 62 68 */ 63 69 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) 64 70 { ··· 183 177 arch_clear_pmem(addr, size); 184 178 else 185 179 default_clear_pmem(addr, size); 180 180 + } 181 181 + 182 182 + /** 183 183 + * wb_cache_pmem - write back processor cache for PMEM memory range 184 184 + * @addr: virtual start address 185 185 + * @size: number of bytes to write back 186 186 + * 187 187 + * Write back the processor cache range starting at 'addr' for 'size' bytes. 188 188 + * This function requires explicit ordering with a wmb_pmem() call. 189 189 + */ 190 190 + static inline void wb_cache_pmem(void __pmem *addr, size_t size) 191 191 + { 192 192 + if (arch_has_pmem_api()) 193 193 + arch_wb_cache_pmem(addr, size); 186 194 } 187 195 #endif /* __PMEM_H__ */

include/linux/radix-tree.h

reviewed

··· 51 51 #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 52 52 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 53 53 54 54 + #define RADIX_DAX_MASK 0xf 55 55 + #define RADIX_DAX_SHIFT 4 56 56 + #define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) 57 57 + #define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) 58 58 + #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) 59 59 + #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) 60 60 + #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ 61 61 + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) 62 62 + 54 63 static inline int radix_tree_is_indirect_ptr(void *ptr) 55 64 { 56 65 return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);

+1 -1

ipc/sem.c

reviewed

@@ -1493,7 +1493,7 @@
 	wake_up_sem_queue_do(&tasks);
 out_free:
 	if (sem_io != fast_sem_io)
-		ipc_free(sem_io, sizeof(ushort)*nsems);
+		ipc_free(sem_io);
 	return err;
 }
 

+3 -8

ipc/util.c

reviewed

@@ -414,17 +414,12 @@
 /**
  * ipc_free - free ipc space
  * @ptr: pointer returned by ipc_alloc
- * @size: size of block
  *
- * Free a block created with ipc_alloc(). The caller must know the size
- * used in the allocation call.
+ * Free a block created with ipc_alloc().
  */
-void ipc_free(void *ptr, int size)
+void ipc_free(void *ptr)
 {
-	if (size > PAGE_SIZE)
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 /**

+1 -1

ipc/util.h

reviewed

@@ -118,7 +118,7 @@
  * both function can sleep
  */
 void *ipc_alloc(int size);
-void ipc_free(void *ptr, int size);
+void ipc_free(void *ptr);
 
 /*
  * For allocation that need to be freed by RCU.

+85 -6

mm/filemap.c

reviewed

@@ -11,6 +11,7 @@
  */
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
@@ -123,9 +124,9 @@
 	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
 	if (shadow) {
-		mapping->nrshadows++;
+		mapping->nrexceptional++;
 		/*
-		 * Make sure the nrshadows update is committed before
+		 * Make sure the nrexceptional update is committed before
 		 * the nrpages update so that final truncate racing
 		 * with reclaim does not see both counters 0 at the
 		 * same time and miss a shadow entry.
@@ -481,6 +482,12 @@
 {
 	int err = 0;
 
+	if (dax_mapping(mapping) && mapping->nrexceptional) {
+		err = dax_writeback_mapping_range(mapping, lstart, lend);
+		if (err)
+			return err;
+	}
+
 	if (mapping->nrpages) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);
@@ -579,9 +586,13 @@
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
+
+		if (WARN_ON(dax_mapping(mapping)))
+			return -EINVAL;
+
 		if (shadowp)
 			*shadowp = p;
-		mapping->nrshadows--;
+		mapping->nrexceptional--;
 		if (node)
 			workingset_node_shadows_dec(node);
 	}
@@ -1245,9 +1256,9 @@
 			if (radix_tree_deref_retry(page))
 				goto restart;
 			/*
-			 * A shadow entry of a recently evicted page,
-			 * or a swap entry from shmem/tmpfs. Return
-			 * it without attempting to raise page count.
+			 * A shadow entry of a recently evicted page, a swap
+			 * entry from shmem/tmpfs or a DAX entry. Return it
+			 * without attempting to raise page count.
 			 */
 			goto export;
 		}
@@ -1493,6 +1504,74 @@
 	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
+
+/**
+ * find_get_entries_tag - find and return entries that match @tag
+ * @mapping: the address_space to search
+ * @start: the starting page cache index
+ * @tag: the tag index
+ * @nr_entries: the maximum number of entries
+ * @entries: where the resulting entries are placed
+ * @indices: the cache indices corresponding to the entries in @entries
+ *
+ * Like find_get_entries, except we only return entries which are tagged with
+ * @tag.
+ */
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+			int tag, unsigned int nr_entries,
+			struct page **entries, pgoff_t *indices)
+{
+	void **slot;
+	unsigned int ret = 0;
+	struct radix_tree_iter iter;
+
+	if (!nr_entries)
+		return 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_tagged(slot, &mapping->page_tree,
+				   &iter, start, tag) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+
+			/*
+			 * A shadow entry of a recently evicted page, a swap
+			 * entry from shmem/tmpfs or a DAX entry. Return it
+			 * without attempting to raise page count.
+			 */
+			goto export;
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+export:
+		indices[ret] = iter.index;
+		entries[ret] = page;
+		if (++ret == nr_entries)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(find_get_entries_tag);
 
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail

+7 -11

mm/percpu.c

reviewed

@@ -305,16 +305,12 @@
 /**
  * pcpu_mem_free - free memory
  * @ptr: memory to free
- * @size: size of the area
  *
  * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
  */
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
 {
-	if (size <= PAGE_SIZE)
-		kfree(ptr);
-	else
-		vfree(ptr);
+	kvfree(ptr);
 }
 
 /**
@@ -463,8 +459,8 @@
 	 * pcpu_mem_free() might end up calling vfree() which uses
 	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
 	 */
-	pcpu_mem_free(old, old_size);
-	pcpu_mem_free(new, new_size);
+	pcpu_mem_free(old);
+	pcpu_mem_free(new);
 
 	return 0;
 }
@@ -732,7 +728,7 @@
 	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
 						sizeof(chunk->map[0]));
 	if (!chunk->map) {
-		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+		pcpu_mem_free(chunk);
 		return NULL;
 	}
 
@@ -753,8 +749,8 @@
 {
 	if (!chunk)
 		return;
-	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
-	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+	pcpu_mem_free(chunk->map);
+	pcpu_mem_free(chunk);
 }
 
 /**

+39 -30

mm/truncate.c

reviewed

@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -34,31 +35,39 @@
 		return;
 
 	spin_lock_irq(&mapping->tree_lock);
-	/*
-	 * Regular page slots are stabilized by the page lock even
-	 * without the tree itself locked. These unlocked entries
-	 * need verification under the tree lock.
-	 */
-	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-		goto unlock;
-	if (*slot != entry)
-		goto unlock;
-	radix_tree_replace_slot(slot, NULL);
-	mapping->nrshadows--;
-	if (!node)
-		goto unlock;
-	workingset_node_shadows_dec(node);
-	/*
-	 * Don't track node without shadow entries.
-	 *
-	 * Avoid acquiring the list_lru lock if already untracked.
-	 * The list_empty() test is safe as node->private_list is
-	 * protected by mapping->tree_lock.
-	 */
-	if (!workingset_node_shadows(node) &&
-	    !list_empty(&node->private_list))
-		list_lru_del(&workingset_shadow_nodes, &node->private_list);
-	__radix_tree_delete_node(&mapping->page_tree, node);
+
+	if (dax_mapping(mapping)) {
+		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+			mapping->nrexceptional--;
+	} else {
+		/*
+		 * Regular page slots are stabilized by the page lock even
+		 * without the tree itself locked. These unlocked entries
+		 * need verification under the tree lock.
+		 */
+		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+					&slot))
+			goto unlock;
+		if (*slot != entry)
+			goto unlock;
+		radix_tree_replace_slot(slot, NULL);
+		mapping->nrexceptional--;
+		if (!node)
+			goto unlock;
+		workingset_node_shadows_dec(node);
+		/*
+		 * Don't track node without shadow entries.
+		 *
+		 * Avoid acquiring the list_lru lock if already untracked.
+		 * The list_empty() test is safe as node->private_list is
+		 * protected by mapping->tree_lock.
+		 */
+		if (!workingset_node_shadows(node) &&
+		    !list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+					&node->private_list);
+		__radix_tree_delete_node(&mapping->page_tree, node);
+	}
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
@@ -228,7 +237,7 @@
 	int		i;
 
 	cleancache_invalidate_inode(mapping);
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	/* Offsets within partial pages */
@@ -402,7 +411,7 @@
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-	unsigned long nrshadows;
+	unsigned long nrexceptional;
 	unsigned long nrpages;
 
 	/*
@@ -416,14 +425,14 @@
 
 	/*
 	 * When reclaim installs eviction entries, it increases
-	 * nrshadows first, then decreases nrpages. Make sure we see
+	 * nrexceptional first, then decreases nrpages. Make sure we see
 	 * this in the right order or we might miss an entry.
 	 */
 	nrpages = mapping->nrpages;
 	smp_rmb();
-	nrshadows = mapping->nrshadows;
+	nrexceptional = mapping->nrexceptional;
 
-	if (nrpages || nrshadows) {
+	if (nrpages || nrexceptional) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle
 		 * the tree lock to make sure any ongoing tree

+8 -1

mm/vmscan.c

reviewed

@@ -46,6 +46,7 @@
 #include <linux/oom.h>
 #include <linux/prefetch.h>
 #include <linux/printk.h>
+#include <linux/dax.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -671,9 +672,15 @@
 		 * inode reclaim needs to empty out the radix tree or
 		 * the nodes are lost. Don't plant shadows behind its
 		 * back.
+		 *
+		 * We also don't store shadows for DAX mappings because the
+		 * only page cache pages found in these are zero pages
+		 * covering holes, and because we don't want to mix DAX
+		 * exceptional entries and shadow exceptional entries in the
+		 * same page_tree.
 		 */
 		if (reclaimed && page_is_file_cache(page) &&
-		    !mapping_exiting(mapping))
+		    !mapping_exiting(mapping) && !dax_mapping(mapping))
 			shadow = workingset_eviction(mapping, page);
 		__delete_from_page_cache(page, shadow, memcg);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);

+2 -2

mm/workingset.c

reviewed

@@ -351,8 +351,8 @@
 			node->slots[i] = NULL;
 			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
 			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-			BUG_ON(!mapping->nrshadows);
-			mapping->nrshadows--;
+			BUG_ON(!mapping->nrexceptional);
+			mapping->nrexceptional--;
 		}
 	}
 	BUG_ON(node->count);

+1 -3

net/ipv4/fib_trie.c

reviewed

@@ -289,10 +289,8 @@
 
 	if (!n->tn_bits)
 		kmem_cache_free(trie_leaf_kmem, n);
-	else if (n->tn_bits <= TNODE_KMALLOC_MAX)
-		kfree(n);
 	else
-		vfree(n);
+		kvfree(n);
 }
 
 #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)