Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
jbd2: fix race between write_metadata_buffer and get_write_access
ext4: Fix ext4_mb_initialize_context() to initialize all fields
ext4: fix null handler of ioctls in no journal mode
ext4: Fix buffer head reference leak in no-journal mode
ext4: Move __ext4_journalled_writepage() to avoid forward declaration
ext4: Fix mmap/truncate race when blocksize < pagesize && !nodelalloc
ext4: Fix mmap/truncate race when blocksize < pagesize && delayed allocation
ext4: Don't look at buffer_heads outside i_size.
ext4: Fix goal inum check in the inode allocator
ext4: fix no journal corruption with locale-gen
ext4: Calculate required journal credits for inserting an extent properly
ext4: Fix truncation of symlinks after failed write
jbd2: Fix a race between checkpointing code and journal_get_write_access()
ext4: Use rcu_barrier() on module unload.
ext4: naturally align struct ext4_allocation_request
ext4: mark several more functions in mballoc.c as noinline
ext4: Fix potential reclaim deadlock when truncating partial block
jbd2: Remove GFP_ATOMIC kmalloc from inside spinlock critical region
ext4: Fix type warning on 64-bit platforms in tracing events header

+240 -398
+12 -12
fs/ext4/ext4.h
··· 93 93 struct ext4_allocation_request { 94 94 /* target inode for block we're allocating */ 95 95 struct inode *inode; 96 - /* logical block in target inode */ 97 - ext4_lblk_t logical; 98 - /* phys. target (a hint) */ 99 - ext4_fsblk_t goal; 100 - /* the closest logical allocated block to the left */ 101 - ext4_lblk_t lleft; 102 - /* phys. block for ^^^ */ 103 - ext4_fsblk_t pleft; 104 - /* the closest logical allocated block to the right */ 105 - ext4_lblk_t lright; 106 - /* phys. block for ^^^ */ 107 - ext4_fsblk_t pright; 108 96 /* how many blocks we want to allocate */ 109 97 unsigned int len; 98 + /* logical block in target inode */ 99 + ext4_lblk_t logical; 100 + /* the closest logical allocated block to the left */ 101 + ext4_lblk_t lleft; 102 + /* the closest logical allocated block to the right */ 103 + ext4_lblk_t lright; 104 + /* phys. target (a hint) */ 105 + ext4_fsblk_t goal; 106 + /* phys. block for the closest logical allocated block to the left */ 107 + ext4_fsblk_t pleft; 108 + /* phys. block for the closest logical allocated block to the right */ 109 + ext4_fsblk_t pright; 110 110 /* flags. see above EXT4_MB_HINT_* */ 111 111 unsigned int flags; 112 112 };
+4
fs/ext4/ext4_jbd2.c
··· 43 43 ext4_journal_abort_handle(where, __func__, bh, 44 44 handle, err); 45 45 } 46 + else 47 + brelse(bh); 46 48 return err; 47 49 } 48 50 ··· 59 57 ext4_journal_abort_handle(where, __func__, bh, 60 58 handle, err); 61 59 } 60 + else 61 + brelse(bh); 62 62 return err; 63 63 } 64 64
+4 -2
fs/ext4/ext4_jbd2.h
··· 131 131 int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132 132 struct buffer_head *bh); 133 133 134 + /* When called with an invalid handle, this will still do a put on the BH */ 134 135 int __ext4_journal_forget(const char *where, handle_t *handle, 135 136 struct buffer_head *bh); 136 137 138 + /* When called with an invalid handle, this will still do a put on the BH */ 137 139 int __ext4_journal_revoke(const char *where, handle_t *handle, 138 140 ext4_fsblk_t blocknr, struct buffer_head *bh); 139 141 ··· 283 281 284 282 static inline int ext4_should_writeback_data(struct inode *inode) 285 283 { 286 - if (EXT4_JOURNAL(inode) == NULL) 287 - return 0; 288 284 if (!S_ISREG(inode->i_mode)) 289 285 return 0; 286 + if (EXT4_JOURNAL(inode) == NULL) 287 + return 1; 290 288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 291 289 return 0; 292 290 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+1
fs/ext4/extents.c
··· 1977 1977 */ 1978 1978 /* 1 bitmap, 1 block group descriptor */ 1979 1979 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 1980 + return ret; 1980 1981 } 1981 1982 } 1982 1983
+1 -1
fs/ext4/ialloc.c
··· 833 833 if (!goal) 834 834 goal = sbi->s_inode_goal; 835 835 836 - if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { 836 + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { 837 837 group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); 838 838 ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); 839 839 ret2 = 0;
+128 -256
fs/ext4/inode.c
··· 78 78 * but there may still be a record of it in the journal, and that record 79 79 * still needs to be revoked. 80 80 * 81 - * If the handle isn't valid we're not journaling so there's nothing to do. 81 + * If the handle isn't valid we're not journaling, but we still need to 82 + * call into ext4_journal_revoke() to put the buffer head. 82 83 */ 83 84 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 84 85 struct buffer_head *bh, ext4_fsblk_t blocknr) 85 86 { 86 87 int err; 87 - 88 - if (!ext4_handle_valid(handle)) 89 - return 0; 90 88 91 89 might_sleep(); 92 90 ··· 1511 1513 * Add inode to orphan list in case we crash before 1512 1514 * truncate finishes 1513 1515 */ 1514 - if (pos + len > inode->i_size) 1516 + if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1515 1517 ext4_orphan_add(handle, inode); 1516 1518 1517 1519 ext4_journal_stop(handle); 1518 1520 if (pos + len > inode->i_size) { 1519 - vmtruncate(inode, inode->i_size); 1521 + ext4_truncate(inode); 1520 1522 /* 1521 - * If vmtruncate failed early the inode might 1523 + * If truncate failed early the inode might 1522 1524 * still be on the orphan list; we need to 1523 1525 * make sure the inode is removed from the 1524 1526 * orphan list in that case. ··· 1612 1614 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1613 1615 page, fsdata); 1614 1616 copied = ret2; 1615 - if (pos + len > inode->i_size) 1617 + if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1616 1618 /* if we have allocated more blocks and copied 1617 1619 * less. We will have blocks allocated outside 1618 1620 * inode->i_size. 
So truncate them ··· 1626 1628 ret = ret2; 1627 1629 1628 1630 if (pos + len > inode->i_size) { 1629 - vmtruncate(inode, inode->i_size); 1631 + ext4_truncate(inode); 1630 1632 /* 1631 - * If vmtruncate failed early the inode might still be 1633 + * If truncate failed early the inode might still be 1632 1634 * on the orphan list; we need to make sure the inode 1633 1635 * is removed from the orphan list in that case. 1634 1636 */ ··· 1653 1655 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1654 1656 page, fsdata); 1655 1657 copied = ret2; 1656 - if (pos + len > inode->i_size) 1658 + if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1657 1659 /* if we have allocated more blocks and copied 1658 1660 * less. We will have blocks allocated outside 1659 1661 * inode->i_size. So truncate them ··· 1668 1670 ret = ret2; 1669 1671 1670 1672 if (pos + len > inode->i_size) { 1671 - vmtruncate(inode, inode->i_size); 1673 + ext4_truncate(inode); 1672 1674 /* 1673 - * If vmtruncate failed early the inode might still be 1675 + * If truncate failed early the inode might still be 1674 1676 * on the orphan list; we need to make sure the inode 1675 1677 * is removed from the orphan list in that case. 1676 1678 */ ··· 1720 1722 1721 1723 unlock_page(page); 1722 1724 page_cache_release(page); 1723 - if (pos + len > inode->i_size) 1725 + if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1724 1726 /* if we have allocated more blocks and copied 1725 1727 * less. We will have blocks allocated outside 1726 1728 * inode->i_size. So truncate them ··· 1731 1733 if (!ret) 1732 1734 ret = ret2; 1733 1735 if (pos + len > inode->i_size) { 1734 - vmtruncate(inode, inode->i_size); 1736 + ext4_truncate(inode); 1735 1737 /* 1736 - * If vmtruncate failed early the inode might still be 1738 + * If truncate failed early the inode might still be 1737 1739 * on the orphan list; we need to make sure the inode 1738 1740 * is removed from the orphan list in that case. 
1739 1741 */ ··· 2303 2305 return; 2304 2306 } 2305 2307 2306 - static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2308 + static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 2307 2309 { 2308 - /* 2309 - * unmapped buffer is possible for holes. 2310 - * delay buffer is possible with delayed allocation. 2311 - * We also need to consider unwritten buffer as unmapped. 2312 - */ 2313 - return (!buffer_mapped(bh) || buffer_delay(bh) || 2314 - buffer_unwritten(bh)) && buffer_dirty(bh); 2310 + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 2315 2311 } 2316 2312 2317 2313 /* ··· 2390 2398 * We need to try to allocate 2391 2399 * unmapped blocks in the same page. 2392 2400 * Otherwise we won't make progress 2393 - * with the page in ext4_da_writepage 2401 + * with the page in ext4_writepage 2394 2402 */ 2395 - if (ext4_bh_unmapped_or_delay(NULL, bh)) { 2403 + if (ext4_bh_delay_or_unwritten(NULL, bh)) { 2396 2404 mpage_add_bh_to_extent(mpd, logical, 2397 2405 bh->b_size, 2398 2406 bh->b_state); ··· 2509 2517 * so call get_block_wrap with create = 0 2510 2518 */ 2511 2519 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); 2512 - BUG_ON(create && ret == 0); 2513 2520 if (ret > 0) { 2514 2521 bh_result->b_size = (ret << inode->i_blkbits); 2515 2522 ret = 0; ··· 2516 2525 return ret; 2517 2526 } 2518 2527 2528 + static int bget_one(handle_t *handle, struct buffer_head *bh) 2529 + { 2530 + get_bh(bh); 2531 + return 0; 2532 + } 2533 + 2534 + static int bput_one(handle_t *handle, struct buffer_head *bh) 2535 + { 2536 + put_bh(bh); 2537 + return 0; 2538 + } 2539 + 2540 + static int __ext4_journalled_writepage(struct page *page, 2541 + struct writeback_control *wbc, 2542 + unsigned int len) 2543 + { 2544 + struct address_space *mapping = page->mapping; 2545 + struct inode *inode = mapping->host; 2546 + struct buffer_head *page_bufs; 2547 + handle_t *handle = NULL; 2548 + int ret = 0; 
2549 + int err; 2550 + 2551 + page_bufs = page_buffers(page); 2552 + BUG_ON(!page_bufs); 2553 + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 2554 + /* As soon as we unlock the page, it can go away, but we have 2555 + * references to buffers so we are safe */ 2556 + unlock_page(page); 2557 + 2558 + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2559 + if (IS_ERR(handle)) { 2560 + ret = PTR_ERR(handle); 2561 + goto out; 2562 + } 2563 + 2564 + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2565 + do_journal_get_write_access); 2566 + 2567 + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 2568 + write_end_fn); 2569 + if (ret == 0) 2570 + ret = err; 2571 + err = ext4_journal_stop(handle); 2572 + if (!ret) 2573 + ret = err; 2574 + 2575 + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 2576 + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 2577 + out: 2578 + return ret; 2579 + } 2580 + 2519 2581 /* 2582 + * Note that we don't need to start a transaction unless we're journaling data 2583 + * because we should have holes filled from ext4_page_mkwrite(). We even don't 2584 + * need to file the inode to the transaction's list in ordered mode because if 2585 + * we are writing back data added by write(), the inode is already there and if 2586 + * we are writing back data modified via mmap(), noone guarantees in which 2587 + * transaction the data will hit the disk. In case we are journaling data, we 2588 + * cannot start transaction directly because transaction start ranks above page 2589 + * lock so we have to do some magic. 2590 + * 2520 2591 * This function can get called via... 
2521 2592 * - ext4_da_writepages after taking page lock (have journal handle) 2522 2593 * - journal_submit_inode_data_buffers (no journal handle) 2523 2594 * - shrink_page_list via pdflush (no journal handle) 2524 2595 * - grab_page_cache when doing write_begin (have journal handle) 2596 + * 2597 + * We don't do any block allocation in this function. If we have page with 2598 + * multiple blocks we need to write those buffer_heads that are mapped. This 2599 + * is important for mmaped based write. So if we do with blocksize 1K 2600 + * truncate(f, 1024); 2601 + * a = mmap(f, 0, 4096); 2602 + * a[0] = 'a'; 2603 + * truncate(f, 4096); 2604 + * we have in the page first buffer_head mapped via page_mkwrite call back 2605 + * but other bufer_heads would be unmapped but dirty(dirty done via the 2606 + * do_wp_page). So writepage should write the first block. If we modify 2607 + * the mmap area beyond 1024 we will again get a page_fault and the 2608 + * page_mkwrite callback will do the block allocation and mark the 2609 + * buffer_heads mapped. 2610 + * 2611 + * We redirty the page if we have any buffer_heads that is either delay or 2612 + * unwritten in the page. 2613 + * 2614 + * We can get recursively called as show below. 2615 + * 2616 + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2617 + * ext4_writepage() 2618 + * 2619 + * But since we don't do any block allocation we should not deadlock. 2620 + * Page also have the dirty flag cleared so we don't get recurive page_lock. 
2525 2621 */ 2526 - static int ext4_da_writepage(struct page *page, 2527 - struct writeback_control *wbc) 2622 + static int ext4_writepage(struct page *page, 2623 + struct writeback_control *wbc) 2528 2624 { 2529 2625 int ret = 0; 2530 2626 loff_t size; ··· 2619 2541 struct buffer_head *page_bufs; 2620 2542 struct inode *inode = page->mapping->host; 2621 2543 2622 - trace_ext4_da_writepage(inode, page); 2544 + trace_ext4_writepage(inode, page); 2623 2545 size = i_size_read(inode); 2624 2546 if (page->index == size >> PAGE_CACHE_SHIFT) 2625 2547 len = size & ~PAGE_CACHE_MASK; ··· 2629 2551 if (page_has_buffers(page)) { 2630 2552 page_bufs = page_buffers(page); 2631 2553 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2632 - ext4_bh_unmapped_or_delay)) { 2554 + ext4_bh_delay_or_unwritten)) { 2633 2555 /* 2634 2556 * We don't want to do block allocation 2635 2557 * So redirty the page and return ··· 2656 2578 * all are mapped and non delay. We don't want to 2657 2579 * do block allocation here. 2658 2580 */ 2659 - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2581 + ret = block_prepare_write(page, 0, len, 2660 2582 noalloc_get_block_write); 2661 2583 if (!ret) { 2662 2584 page_bufs = page_buffers(page); 2663 2585 /* check whether all are mapped and non delay */ 2664 2586 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2665 - ext4_bh_unmapped_or_delay)) { 2587 + ext4_bh_delay_or_unwritten)) { 2666 2588 redirty_page_for_writepage(wbc, page); 2667 2589 unlock_page(page); 2668 2590 return 0; ··· 2678 2600 return 0; 2679 2601 } 2680 2602 /* now mark the buffer_heads as dirty and uptodate */ 2681 - block_commit_write(page, 0, PAGE_CACHE_SIZE); 2603 + block_commit_write(page, 0, len); 2604 + } 2605 + 2606 + if (PageChecked(page) && ext4_should_journal_data(inode)) { 2607 + /* 2608 + * It's mmapped pagecache. Add buffers and journal it. There 2609 + * doesn't seem much point in redirtying the page here. 
2610 + */ 2611 + ClearPageChecked(page); 2612 + return __ext4_journalled_writepage(page, wbc, len); 2682 2613 } 2683 2614 2684 2615 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ··· 2994 2907 * i_size_read because we hold i_mutex. 2995 2908 */ 2996 2909 if (pos + len > inode->i_size) 2997 - vmtruncate(inode, inode->i_size); 2910 + ext4_truncate(inode); 2998 2911 } 2999 2912 3000 2913 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) ··· 3217 3130 return generic_block_bmap(mapping, block, ext4_get_block); 3218 3131 } 3219 3132 3220 - static int bget_one(handle_t *handle, struct buffer_head *bh) 3221 - { 3222 - get_bh(bh); 3223 - return 0; 3224 - } 3225 - 3226 - static int bput_one(handle_t *handle, struct buffer_head *bh) 3227 - { 3228 - put_bh(bh); 3229 - return 0; 3230 - } 3231 - 3232 - /* 3233 - * Note that we don't need to start a transaction unless we're journaling data 3234 - * because we should have holes filled from ext4_page_mkwrite(). We even don't 3235 - * need to file the inode to the transaction's list in ordered mode because if 3236 - * we are writing back data added by write(), the inode is already there and if 3237 - * we are writing back data modified via mmap(), noone guarantees in which 3238 - * transaction the data will hit the disk. In case we are journaling data, we 3239 - * cannot start transaction directly because transaction start ranks above page 3240 - * lock so we have to do some magic. 3241 - * 3242 - * In all journaling modes block_write_full_page() will start the I/O. 3243 - * 3244 - * Problem: 3245 - * 3246 - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 3247 - * ext4_writepage() 3248 - * 3249 - * Similar for: 3250 - * 3251 - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 3252 - * 3253 - * Same applies to ext4_get_block(). 
We will deadlock on various things like 3254 - * lock_journal and i_data_sem 3255 - * 3256 - * Setting PF_MEMALLOC here doesn't work - too many internal memory 3257 - * allocations fail. 3258 - * 3259 - * 16May01: If we're reentered then journal_current_handle() will be 3260 - * non-zero. We simply *return*. 3261 - * 3262 - * 1 July 2001: @@@ FIXME: 3263 - * In journalled data mode, a data buffer may be metadata against the 3264 - * current transaction. But the same file is part of a shared mapping 3265 - * and someone does a writepage() on it. 3266 - * 3267 - * We will move the buffer onto the async_data list, but *after* it has 3268 - * been dirtied. So there's a small window where we have dirty data on 3269 - * BJ_Metadata. 3270 - * 3271 - * Note that this only applies to the last partial page in the file. The 3272 - * bit which block_write_full_page() uses prepare/commit for. (That's 3273 - * broken code anyway: it's wrong for msync()). 3274 - * 3275 - * It's a rare case: affects the final partial page, for journalled data 3276 - * where the file is subject to bith write() and writepage() in the same 3277 - * transction. To fix it we'll need a custom block_write_full_page(). 3278 - * We'll probably need that anyway for journalling writepage() output. 3279 - * 3280 - * We don't honour synchronous mounts for writepage(). That would be 3281 - * disastrous. Any write() or metadata operation will sync the fs for 3282 - * us. 
3283 - * 3284 - */ 3285 - static int __ext4_normal_writepage(struct page *page, 3286 - struct writeback_control *wbc) 3287 - { 3288 - struct inode *inode = page->mapping->host; 3289 - 3290 - if (test_opt(inode->i_sb, NOBH)) 3291 - return nobh_writepage(page, noalloc_get_block_write, wbc); 3292 - else 3293 - return block_write_full_page(page, noalloc_get_block_write, 3294 - wbc); 3295 - } 3296 - 3297 - static int ext4_normal_writepage(struct page *page, 3298 - struct writeback_control *wbc) 3299 - { 3300 - struct inode *inode = page->mapping->host; 3301 - loff_t size = i_size_read(inode); 3302 - loff_t len; 3303 - 3304 - trace_ext4_normal_writepage(inode, page); 3305 - J_ASSERT(PageLocked(page)); 3306 - if (page->index == size >> PAGE_CACHE_SHIFT) 3307 - len = size & ~PAGE_CACHE_MASK; 3308 - else 3309 - len = PAGE_CACHE_SIZE; 3310 - 3311 - if (page_has_buffers(page)) { 3312 - /* if page has buffers it should all be mapped 3313 - * and allocated. If there are not buffers attached 3314 - * to the page we know the page is dirty but it lost 3315 - * buffers. That means that at some moment in time 3316 - * after write_begin() / write_end() has been called 3317 - * all buffers have been clean and thus they must have been 3318 - * written at least once. So they are all mapped and we can 3319 - * happily proceed with mapping them and writing the page. 
3320 - */ 3321 - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3322 - ext4_bh_unmapped_or_delay)); 3323 - } 3324 - 3325 - if (!ext4_journal_current_handle()) 3326 - return __ext4_normal_writepage(page, wbc); 3327 - 3328 - redirty_page_for_writepage(wbc, page); 3329 - unlock_page(page); 3330 - return 0; 3331 - } 3332 - 3333 - static int __ext4_journalled_writepage(struct page *page, 3334 - struct writeback_control *wbc) 3335 - { 3336 - struct address_space *mapping = page->mapping; 3337 - struct inode *inode = mapping->host; 3338 - struct buffer_head *page_bufs; 3339 - handle_t *handle = NULL; 3340 - int ret = 0; 3341 - int err; 3342 - 3343 - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3344 - noalloc_get_block_write); 3345 - if (ret != 0) 3346 - goto out_unlock; 3347 - 3348 - page_bufs = page_buffers(page); 3349 - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 3350 - bget_one); 3351 - /* As soon as we unlock the page, it can go away, but we have 3352 - * references to buffers so we are safe */ 3353 - unlock_page(page); 3354 - 3355 - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 3356 - if (IS_ERR(handle)) { 3357 - ret = PTR_ERR(handle); 3358 - goto out; 3359 - } 3360 - 3361 - ret = walk_page_buffers(handle, page_bufs, 0, 3362 - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 3363 - 3364 - err = walk_page_buffers(handle, page_bufs, 0, 3365 - PAGE_CACHE_SIZE, NULL, write_end_fn); 3366 - if (ret == 0) 3367 - ret = err; 3368 - err = ext4_journal_stop(handle); 3369 - if (!ret) 3370 - ret = err; 3371 - 3372 - walk_page_buffers(handle, page_bufs, 0, 3373 - PAGE_CACHE_SIZE, NULL, bput_one); 3374 - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3375 - goto out; 3376 - 3377 - out_unlock: 3378 - unlock_page(page); 3379 - out: 3380 - return ret; 3381 - } 3382 - 3383 - static int ext4_journalled_writepage(struct page *page, 3384 - struct writeback_control *wbc) 3385 - { 3386 - struct inode *inode = 
page->mapping->host; 3387 - loff_t size = i_size_read(inode); 3388 - loff_t len; 3389 - 3390 - trace_ext4_journalled_writepage(inode, page); 3391 - J_ASSERT(PageLocked(page)); 3392 - if (page->index == size >> PAGE_CACHE_SHIFT) 3393 - len = size & ~PAGE_CACHE_MASK; 3394 - else 3395 - len = PAGE_CACHE_SIZE; 3396 - 3397 - if (page_has_buffers(page)) { 3398 - /* if page has buffers it should all be mapped 3399 - * and allocated. If there are not buffers attached 3400 - * to the page we know the page is dirty but it lost 3401 - * buffers. That means that at some moment in time 3402 - * after write_begin() / write_end() has been called 3403 - * all buffers have been clean and thus they must have been 3404 - * written at least once. So they are all mapped and we can 3405 - * happily proceed with mapping them and writing the page. 3406 - */ 3407 - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3408 - ext4_bh_unmapped_or_delay)); 3409 - } 3410 - 3411 - if (ext4_journal_current_handle()) 3412 - goto no_write; 3413 - 3414 - if (PageChecked(page)) { 3415 - /* 3416 - * It's mmapped pagecache. Add buffers and journal it. There 3417 - * doesn't seem much point in redirtying the page here. 3418 - */ 3419 - ClearPageChecked(page); 3420 - return __ext4_journalled_writepage(page, wbc); 3421 - } else { 3422 - /* 3423 - * It may be a page full of checkpoint-mode buffers. We don't 3424 - * really know unless we go poke around in the buffer_heads. 3425 - * But block_write_full_page will do the right thing. 
3426 - */ 3427 - return block_write_full_page(page, noalloc_get_block_write, 3428 - wbc); 3429 - } 3430 - no_write: 3431 - redirty_page_for_writepage(wbc, page); 3432 - unlock_page(page); 3433 - return 0; 3434 - } 3435 - 3436 3133 static int ext4_readpage(struct file *file, struct page *page) 3437 3134 { 3438 3135 return mpage_readpage(page, ext4_get_block); ··· 3363 3492 static const struct address_space_operations ext4_ordered_aops = { 3364 3493 .readpage = ext4_readpage, 3365 3494 .readpages = ext4_readpages, 3366 - .writepage = ext4_normal_writepage, 3495 + .writepage = ext4_writepage, 3367 3496 .sync_page = block_sync_page, 3368 3497 .write_begin = ext4_write_begin, 3369 3498 .write_end = ext4_ordered_write_end, ··· 3378 3507 static const struct address_space_operations ext4_writeback_aops = { 3379 3508 .readpage = ext4_readpage, 3380 3509 .readpages = ext4_readpages, 3381 - .writepage = ext4_normal_writepage, 3510 + .writepage = ext4_writepage, 3382 3511 .sync_page = block_sync_page, 3383 3512 .write_begin = ext4_write_begin, 3384 3513 .write_end = ext4_writeback_write_end, ··· 3393 3522 static const struct address_space_operations ext4_journalled_aops = { 3394 3523 .readpage = ext4_readpage, 3395 3524 .readpages = ext4_readpages, 3396 - .writepage = ext4_journalled_writepage, 3525 + .writepage = ext4_writepage, 3397 3526 .sync_page = block_sync_page, 3398 3527 .write_begin = ext4_write_begin, 3399 3528 .write_end = ext4_journalled_write_end, ··· 3407 3536 static const struct address_space_operations ext4_da_aops = { 3408 3537 .readpage = ext4_readpage, 3409 3538 .readpages = ext4_readpages, 3410 - .writepage = ext4_da_writepage, 3539 + .writepage = ext4_writepage, 3411 3540 .writepages = ext4_da_writepages, 3412 3541 .sync_page = block_sync_page, 3413 3542 .write_begin = ext4_da_write_begin, ··· 3454 3583 struct page *page; 3455 3584 int err = 0; 3456 3585 3457 - page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3586 + page = 
find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3587 + mapping_gfp_mask(mapping) & ~__GFP_FS); 3458 3588 if (!page) 3459 3589 return -EINVAL; 3460 3590
+12 -8
fs/ext4/ioctl.c
··· 191 191 case EXT4_IOC_GROUP_EXTEND: { 192 192 ext4_fsblk_t n_blocks_count; 193 193 struct super_block *sb = inode->i_sb; 194 - int err, err2; 194 + int err, err2=0; 195 195 196 196 if (!capable(CAP_SYS_RESOURCE)) 197 197 return -EPERM; ··· 204 204 return err; 205 205 206 206 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 207 - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 208 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 209 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 207 + if (EXT4_SB(sb)->s_journal) { 208 + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 209 + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 210 + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 211 + } 210 212 if (err == 0) 211 213 err = err2; 212 214 mnt_drop_write(filp->f_path.mnt); ··· 253 251 case EXT4_IOC_GROUP_ADD: { 254 252 struct ext4_new_group_data input; 255 253 struct super_block *sb = inode->i_sb; 256 - int err, err2; 254 + int err, err2=0; 257 255 258 256 if (!capable(CAP_SYS_RESOURCE)) 259 257 return -EPERM; ··· 267 265 return err; 268 266 269 267 err = ext4_group_add(sb, &input); 270 - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 271 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 272 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 268 + if (EXT4_SB(sb)->s_journal) { 269 + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 270 + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); 271 + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 272 + } 273 273 if (err == 0) 274 274 err = err2; 275 275 mnt_drop_write(filp->f_path.mnt);
+23 -27
fs/ext4/mballoc.c
··· 657 657 } 658 658 } 659 659 660 - static void ext4_mb_generate_buddy(struct super_block *sb, 660 + static noinline_for_stack 661 + void ext4_mb_generate_buddy(struct super_block *sb, 661 662 void *buddy, void *bitmap, ext4_group_t group) 662 663 { 663 664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); ··· 1481 1480 ext4_mb_check_limits(ac, e4b, 0); 1482 1481 } 1483 1482 1484 - static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1483 + static noinline_for_stack 1484 + int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1485 1485 struct ext4_buddy *e4b) 1486 1486 { 1487 1487 struct ext4_free_extent ex = ac->ac_b_ex; ··· 1509 1507 return 0; 1510 1508 } 1511 1509 1512 - static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1510 + static noinline_for_stack 1511 + int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1513 1512 struct ext4_buddy *e4b) 1514 1513 { 1515 1514 ext4_group_t group = ac->ac_g_ex.fe_group; ··· 1569 1566 * The routine scans buddy structures (not bitmap!) from given order 1570 1567 * to max order and tries to find big enough chunk to satisfy the req 1571 1568 */ 1572 - static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1569 + static noinline_for_stack 1570 + void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1573 1571 struct ext4_buddy *e4b) 1574 1572 { 1575 1573 struct super_block *sb = ac->ac_sb; ··· 1613 1609 * In order to optimize scanning, caller must pass number of 1614 1610 * free blocks in the group, so the routine can know upper limit. 
1615 1611 */ 1616 - static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1612 + static noinline_for_stack 1613 + void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1617 1614 struct ext4_buddy *e4b) 1618 1615 { 1619 1616 struct super_block *sb = ac->ac_sb; ··· 1673 1668 * we try to find stripe-aligned chunks for stripe-size requests 1674 1669 * XXX should do so at least for multiples of stripe size as well 1675 1670 */ 1676 - static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1671 + static noinline_for_stack 1672 + void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1677 1673 struct ext4_buddy *e4b) 1678 1674 { 1679 1675 struct super_block *sb = ac->ac_sb; ··· 1837 1831 1838 1832 } 1839 1833 1840 - static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1834 + static noinline_for_stack 1835 + int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1841 1836 { 1842 1837 1843 1838 int ret; ··· 2909 2902 2910 2903 void exit_ext4_mballoc(void) 2911 2904 { 2912 - /* XXX: synchronize_rcu(); */ 2905 + /* 2906 + * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2907 + * before destroying the slab cache. 2908 + */ 2909 + rcu_barrier(); 2913 2910 kmem_cache_destroy(ext4_pspace_cachep); 2914 2911 kmem_cache_destroy(ext4_ac_cachep); 2915 2912 kmem_cache_destroy(ext4_free_ext_cachep); ··· 3468 3457 * used in in-core bitmap. 
buddy must be generated from this bitmap 3469 3458 * Need to be called with ext4 group lock held 3470 3459 */ 3471 - static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3460 + static noinline_for_stack 3461 + void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3472 3462 ext4_group_t group) 3473 3463 { 3474 3464 struct ext4_group_info *grp = ext4_get_group_info(sb, group); ··· 4227 4215 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4228 4216 4229 4217 /* set up allocation goals */ 4218 + memset(ac, 0, sizeof(struct ext4_allocation_context)); 4230 4219 ac->ac_b_ex.fe_logical = ar->logical; 4231 - ac->ac_b_ex.fe_group = 0; 4232 - ac->ac_b_ex.fe_start = 0; 4233 - ac->ac_b_ex.fe_len = 0; 4234 4220 ac->ac_status = AC_STATUS_CONTINUE; 4235 - ac->ac_groups_scanned = 0; 4236 - ac->ac_ex_scanned = 0; 4237 - ac->ac_found = 0; 4238 4221 ac->ac_sb = sb; 4239 4222 ac->ac_inode = ar->inode; 4240 4223 ac->ac_o_ex.fe_logical = ar->logical; ··· 4240 4233 ac->ac_g_ex.fe_group = group; 4241 4234 ac->ac_g_ex.fe_start = block; 4242 4235 ac->ac_g_ex.fe_len = len; 4243 - ac->ac_f_ex.fe_len = 0; 4244 4236 ac->ac_flags = ar->flags; 4245 - ac->ac_2order = 0; 4246 - ac->ac_criteria = 0; 4247 - ac->ac_pa = NULL; 4248 - ac->ac_bitmap_page = NULL; 4249 - ac->ac_buddy_page = NULL; 4250 - ac->alloc_semp = NULL; 4251 - ac->ac_lg = NULL; 4252 4237 4253 4238 /* we have to define context: we'll we work with a file or 4254 4239 * locality group. this is a policy, actually */ ··· 4508 4509 } 4509 4510 4510 4511 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4511 - if (ac) { 4512 - ac->ac_sb = sb; 4513 - ac->ac_inode = ar->inode; 4514 - } else { 4512 + if (!ac) { 4515 4513 ar->len = 0; 4516 4514 *errp = -ENOMEM; 4517 4515 goto out1;
+17 -14
fs/jbd2/journal.c
··· 297 297 unsigned int new_offset; 298 298 struct buffer_head *bh_in = jh2bh(jh_in); 299 299 struct jbd2_buffer_trigger_type *triggers; 300 + journal_t *journal = transaction->t_journal; 300 301 301 302 /* 302 303 * The buffer really shouldn't be locked: only the current committing ··· 311 310 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 312 311 313 312 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 313 + /* keep subsequent assertions sane */ 314 + new_bh->b_state = 0; 315 + init_buffer(new_bh, NULL, NULL); 316 + atomic_set(&new_bh->b_count, 1); 317 + new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ 314 318 315 319 /* 316 320 * If a new transaction has already done a buffer copy-out, then ··· 394 388 kunmap_atomic(mapped_data, KM_USER0); 395 389 } 396 390 397 - /* keep subsequent assertions sane */ 398 - new_bh->b_state = 0; 399 - init_buffer(new_bh, NULL, NULL); 400 - atomic_set(&new_bh->b_count, 1); 401 - jbd_unlock_bh_state(bh_in); 402 - 403 - new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ 404 - 405 391 set_bh_page(new_bh, new_page, new_offset); 406 392 new_jh->b_transaction = NULL; 407 393 new_bh->b_size = jh2bh(jh_in)->b_size; ··· 410 412 * copying is moved to the transaction's shadow queue. 
411 413 */ 412 414 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 413 - jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 415 + spin_lock(&journal->j_list_lock); 416 + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 417 + spin_unlock(&journal->j_list_lock); 418 + jbd_unlock_bh_state(bh_in); 419 + 414 420 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 415 421 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); 416 422 ··· 2412 2410 int i = hash_32(device, CACHE_SIZE_BITS); 2413 2411 char *ret; 2414 2412 struct block_device *bd; 2413 + static struct devname_cache *new_dev; 2415 2414 2416 2415 rcu_read_lock(); 2417 2416 if (devcache[i] && devcache[i]->device == device) { ··· 2422 2419 } 2423 2420 rcu_read_unlock(); 2424 2421 2422 + new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); 2423 + if (!new_dev) 2424 + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ 2425 2425 spin_lock(&devname_cache_lock); 2426 2426 if (devcache[i]) { 2427 2427 if (devcache[i]->device == device) { 2428 + kfree(new_dev); 2428 2429 ret = devcache[i]->devname; 2429 2430 spin_unlock(&devname_cache_lock); 2430 2431 return ret; 2431 2432 } 2432 2433 call_rcu(&devcache[i]->rcu, free_devcache); 2433 2434 } 2434 - devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); 2435 - if (!devcache[i]) { 2436 - spin_unlock(&devname_cache_lock); 2437 - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ 2438 - } 2435 + devcache[i] = new_dev; 2439 2436 devcache[i]->device = device; 2440 2437 bd = bdget(device); 2441 2438 if (bd) {
+35 -33
fs/jbd2/transaction.c
··· 499 499 wake_up(&journal->j_wait_transaction_locked); 500 500 } 501 501 502 - /* 503 - * Report any unexpected dirty buffers which turn up. Normally those 504 - * indicate an error, but they can occur if the user is running (say) 505 - * tune2fs to modify the live filesystem, so we need the option of 506 - * continuing as gracefully as possible. # 507 - * 508 - * The caller should already hold the journal lock and 509 - * j_list_lock spinlock: most callers will need those anyway 510 - * in order to probe the buffer's journaling state safely. 511 - */ 512 - static void jbd_unexpected_dirty_buffer(struct journal_head *jh) 502 + static void warn_dirty_buffer(struct buffer_head *bh) 513 503 { 514 - int jlist; 504 + char b[BDEVNAME_SIZE]; 515 505 516 - /* If this buffer is one which might reasonably be dirty 517 - * --- ie. data, or not part of this journal --- then 518 - * we're OK to leave it alone, but otherwise we need to 519 - * move the dirty bit to the journal's own internal 520 - * JBDDirty bit. */ 521 - jlist = jh->b_jlist; 522 - 523 - if (jlist == BJ_Metadata || jlist == BJ_Reserved || 524 - jlist == BJ_Shadow || jlist == BJ_Forget) { 525 - struct buffer_head *bh = jh2bh(jh); 526 - 527 - if (test_clear_buffer_dirty(bh)) 528 - set_buffer_jbddirty(bh); 529 - } 506 + printk(KERN_WARNING 507 + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 508 + "There's a risk of filesystem corruption in case of system " 509 + "crash.\n", 510 + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 530 511 } 531 512 532 513 /* ··· 574 593 if (jh->b_next_transaction) 575 594 J_ASSERT_JH(jh, jh->b_next_transaction == 576 595 transaction); 596 + warn_dirty_buffer(bh); 577 597 } 578 598 /* 579 599 * In any case we need to clean the dirty flag and we must 580 600 * do it under the buffer lock to be sure we don't race 581 601 * with running write-out. 
582 602 */ 583 - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 584 - jbd_unexpected_dirty_buffer(jh); 603 + JBUFFER_TRACE(jh, "Journalling dirty buffer"); 604 + clear_buffer_dirty(bh); 605 + set_buffer_jbddirty(bh); 585 606 } 586 607 587 608 unlock_buffer(bh); ··· 826 843 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 827 844 828 845 if (jh->b_transaction == NULL) { 846 + /* 847 + * Previous jbd2_journal_forget() could have left the buffer 848 + * with jbddirty bit set because it was being committed. When 849 + * the commit finished, we've filed the buffer for 850 + * checkpointing and marked it dirty. Now we are reallocating 851 + * the buffer so the transaction freeing it must have 852 + * committed and so it's safe to clear the dirty bit. 853 + */ 854 + clear_buffer_dirty(jh2bh(jh)); 829 855 jh->b_transaction = transaction; 830 856 831 857 /* first access by this transaction */ ··· 1636 1644 1637 1645 if (jh->b_cp_transaction) { 1638 1646 JBUFFER_TRACE(jh, "on running+cp transaction"); 1647 + /* 1648 + * We don't want to write the buffer anymore, clear the 1649 + * bit so that we don't confuse checks in 1650 + * __journal_file_buffer 1651 + */ 1652 + clear_buffer_dirty(bh); 1639 1653 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1640 - clear_buffer_jbddirty(bh); 1641 1654 may_free = 0; 1642 1655 } else { 1643 1656 JBUFFER_TRACE(jh, "on running transaction"); ··· 1893 1896 if (jh->b_transaction && jh->b_jlist == jlist) 1894 1897 return; 1895 1898 1896 - /* The following list of buffer states needs to be consistent 1897 - * with __jbd_unexpected_dirty_buffer()'s handling of dirty 1898 - * state. */ 1899 - 1900 1899 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 1901 1900 jlist == BJ_Shadow || jlist == BJ_Forget) { 1901 + /* 1902 + * For metadata buffers, we track dirty bit in buffer_jbddirty 1903 + * instead of buffer_dirty. We should not see a dirty bit set 1904 + * here because we clear it in do_get_write_access but e.g. 
1905 + * tune2fs can modify the sb and set the dirty bit at any time 1906 + * so we try to gracefully handle that. 1907 + */ 1908 + if (buffer_dirty(bh)) 1909 + warn_dirty_buffer(bh); 1902 1910 if (test_clear_buffer_dirty(bh) || 1903 1911 test_clear_buffer_jbddirty(bh)) 1904 1912 was_dirty = 1;
+3 -45
include/trace/events/ext4.h
··· 34 34 35 35 TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", 36 36 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, 37 - __entry->uid, __entry->gid, __entry->blocks) 37 + __entry->uid, __entry->gid, 38 + (unsigned long long) __entry->blocks) 38 39 ); 39 40 40 41 TRACE_EVENT(ext4_request_inode, ··· 190 189 __entry->copied) 191 190 ); 192 191 193 - TRACE_EVENT(ext4_da_writepage, 192 + TRACE_EVENT(ext4_writepage, 194 193 TP_PROTO(struct inode *inode, struct page *page), 195 194 196 195 TP_ARGS(inode, page), ··· 340 339 TP_printk("dev %s ino %lu pos %llu len %u copied %u", 341 340 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 342 341 __entry->copied) 343 - ); 344 - 345 - TRACE_EVENT(ext4_normal_writepage, 346 - TP_PROTO(struct inode *inode, struct page *page), 347 - 348 - TP_ARGS(inode, page), 349 - 350 - TP_STRUCT__entry( 351 - __field( dev_t, dev ) 352 - __field( ino_t, ino ) 353 - __field( pgoff_t, index ) 354 - ), 355 - 356 - TP_fast_assign( 357 - __entry->dev = inode->i_sb->s_dev; 358 - __entry->ino = inode->i_ino; 359 - __entry->index = page->index; 360 - ), 361 - 362 - TP_printk("dev %s ino %lu page_index %lu", 363 - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) 364 - ); 365 - 366 - TRACE_EVENT(ext4_journalled_writepage, 367 - TP_PROTO(struct inode *inode, struct page *page), 368 - 369 - TP_ARGS(inode, page), 370 - 371 - TP_STRUCT__entry( 372 - __field( dev_t, dev ) 373 - __field( ino_t, ino ) 374 - __field( pgoff_t, index ) 375 - 376 - ), 377 - 378 - TP_fast_assign( 379 - __entry->dev = inode->i_sb->s_dev; 380 - __entry->ino = inode->i_ino; 381 - __entry->index = page->index; 382 - ), 383 - 384 - TP_printk("dev %s ino %lu page_index %lu", 385 - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) 386 342 ); 387 343 388 344 TRACE_EVENT(ext4_discard_blocks,