fs/buffer.c at b488997b9cb006e175908b70fc0a2f3601a763d1

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / buffer.c
at b488997b9cb006e175908b70fc0a2f3601a763d1 3163 lines 85 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/fs/buffer.c
   4 *
   5 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   6 */
   7
   8/*
   9 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
  10 *
  11 * Removed a lot of unnecessary code and simplified things now that
  12 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  13 *
  14 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  15 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  16 *
  17 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
  18 *
  19 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  20 */
  21
  22#include <linux/kernel.h>
  23#include <linux/sched/signal.h>
  24#include <linux/syscalls.h>
  25#include <linux/fs.h>
  26#include <linux/iomap.h>
  27#include <linux/mm.h>
  28#include <linux/percpu.h>
  29#include <linux/slab.h>
  30#include <linux/capability.h>
  31#include <linux/blkdev.h>
  32#include <linux/blk-crypto.h>
  33#include <linux/file.h>
  34#include <linux/quotaops.h>
  35#include <linux/highmem.h>
  36#include <linux/export.h>
  37#include <linux/backing-dev.h>
  38#include <linux/writeback.h>
  39#include <linux/hash.h>
  40#include <linux/suspend.h>
  41#include <linux/buffer_head.h>
  42#include <linux/task_io_accounting_ops.h>
  43#include <linux/bio.h>
  44#include <linux/cpu.h>
  45#include <linux/bitops.h>
  46#include <linux/mpage.h>
  47#include <linux/bit_spinlock.h>
  48#include <linux/pagevec.h>
  49#include <linux/sched/mm.h>
  50#include <trace/events/block.h>
  51#include <linux/fscrypt.h>
  52#include <linux/fsverity.h>
  53#include <linux/sched/isolation.h>
  54
  55#include "internal.h"
  56
/*
 * Forward declarations.  fsync_buffers_list() is called by
 * sync_mapping_buffers() ahead of its definition in this file.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
			  enum rw_hint hint, struct writeback_control *wbc);

/* Map a list node embedded as ->b_assoc_buffers back to its buffer_head. */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  62
  63inline void touch_buffer(struct buffer_head *bh)
  64{
  65	trace_block_touch_buffer(bh);
  66	folio_mark_accessed(bh->b_folio);
  67}
  68EXPORT_SYMBOL(touch_buffer);
  69
  70void __lock_buffer(struct buffer_head *bh)
  71{
  72	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
  73}
  74EXPORT_SYMBOL(__lock_buffer);
  75
  76void unlock_buffer(struct buffer_head *bh)
  77{
  78	clear_bit_unlock(BH_Lock, &bh->b_state);
  79	smp_mb__after_atomic();
  80	wake_up_bit(&bh->b_state, BH_Lock);
  81}
  82EXPORT_SYMBOL(unlock_buffer);
  83
  84/*
  85 * Returns if the folio has dirty or writeback buffers. If all the buffers
  86 * are unlocked and clean then the folio_test_dirty information is stale. If
  87 * any of the buffers are locked, it is assumed they are locked for IO.
  88 */
  89void buffer_check_dirty_writeback(struct folio *folio,
  90				     bool *dirty, bool *writeback)
  91{
  92	struct buffer_head *head, *bh;
  93	*dirty = false;
  94	*writeback = false;
  95
  96	BUG_ON(!folio_test_locked(folio));
  97
  98	head = folio_buffers(folio);
  99	if (!head)
 100		return;
 101
 102	if (folio_test_writeback(folio))
 103		*writeback = true;
 104
 105	bh = head;
 106	do {
 107		if (buffer_locked(bh))
 108			*writeback = true;
 109
 110		if (buffer_dirty(bh))
 111			*dirty = true;
 112
 113		bh = bh->b_this_page;
 114	} while (bh != head);
 115}
 116
 117/*
 118 * Block until a buffer comes unlocked.  This doesn't stop it
 119 * from becoming locked again - you have to lock it yourself
 120 * if you want to preserve its state.
 121 */
 122void __wait_on_buffer(struct buffer_head * bh)
 123{
 124	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
 125}
 126EXPORT_SYMBOL(__wait_on_buffer);
 127
 128static void buffer_io_error(struct buffer_head *bh, char *msg)
 129{
 130	if (!test_bit(BH_Quiet, &bh->b_state))
 131		printk_ratelimited(KERN_ERR
 132			"Buffer I/O error on dev %pg, logical block %llu%s\n",
 133			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
 134}
 135
 136/*
 137 * End-of-IO handler helper function which does not touch the bh after
 138 * unlocking it.
 139 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 140 * a race there is benign: unlock_buffer() only use the bh's address for
 141 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 142 * itself.
 143 */
 144static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 145{
 146	if (uptodate) {
 147		set_buffer_uptodate(bh);
 148	} else {
 149		/* This happens, due to failed read-ahead attempts. */
 150		clear_buffer_uptodate(bh);
 151	}
 152	unlock_buffer(bh);
 153}
 154
/*
 * Default synchronous end-of-IO handler for reads: drop one reference on
 * @bh, record @uptodate in the buffer and unlock it.
 *
 * The reference is dropped first, so that the last thing done here is the
 * unlock performed by __end_buffer_read_notouch(); nothing in this function
 * accesses @bh after it has been unlocked.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	put_bh(bh);
	__end_buffer_read_notouch(bh, uptodate);
}
EXPORT_SYMBOL(end_buffer_read_sync);
 165
 166void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 167{
 168	if (uptodate) {
 169		set_buffer_uptodate(bh);
 170	} else {
 171		buffer_io_error(bh, ", lost sync page write");
 172		mark_buffer_write_io_error(bh);
 173		clear_buffer_uptodate(bh);
 174	}
 175	unlock_buffer(bh);
 176	put_bh(bh);
 177}
 178EXPORT_SYMBOL(end_buffer_write_sync);
 179
 180static struct buffer_head *
 181__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
 182{
 183	struct address_space *bd_mapping = bdev->bd_mapping;
 184	const int blkbits = bd_mapping->host->i_blkbits;
 185	struct buffer_head *ret = NULL;
 186	pgoff_t index;
 187	struct buffer_head *bh;
 188	struct buffer_head *head;
 189	struct folio *folio;
 190	int all_mapped = 1;
 191	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 192
 193	index = ((loff_t)block << blkbits) / PAGE_SIZE;
 194	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
 195	if (IS_ERR(folio))
 196		goto out;
 197
 198	/*
 199	 * Folio lock protects the buffers. Callers that cannot block
 200	 * will fallback to serializing vs try_to_free_buffers() via
 201	 * the i_private_lock.
 202	 */
 203	if (atomic)
 204		spin_lock(&bd_mapping->i_private_lock);
 205	else
 206		folio_lock(folio);
 207
 208	head = folio_buffers(folio);
 209	if (!head)
 210		goto out_unlock;
 211	/*
 212	 * Upon a noref migration, the folio lock serializes here;
 213	 * otherwise bail.
 214	 */
 215	if (test_bit_acquire(BH_Migrate, &head->b_state)) {
 216		WARN_ON(!atomic);
 217		goto out_unlock;
 218	}
 219
 220	bh = head;
 221	do {
 222		if (!buffer_mapped(bh))
 223			all_mapped = 0;
 224		else if (bh->b_blocknr == block) {
 225			ret = bh;
 226			get_bh(bh);
 227			goto out_unlock;
 228		}
 229		bh = bh->b_this_page;
 230	} while (bh != head);
 231
 232	/* we might be here because some of the buffers on this page are
 233	 * not mapped.  This is due to various races between
 234	 * file io on the block device and getblk.  It gets dealt with
 235	 * elsewhere, don't buffer_error if we had some unmapped buffers
 236	 */
 237	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
 238	if (all_mapped && __ratelimit(&last_warned)) {
 239		printk("__find_get_block_slow() failed. block=%llu, "
 240		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
 241		       "device %pg blocksize: %d\n",
 242		       (unsigned long long)block,
 243		       (unsigned long long)bh->b_blocknr,
 244		       bh->b_state, bh->b_size, bdev,
 245		       1 << blkbits);
 246	}
 247out_unlock:
 248	if (atomic)
 249		spin_unlock(&bd_mapping->i_private_lock);
 250	else
 251		folio_unlock(folio);
 252	folio_put(folio);
 253out:
 254	return ret;
 255}
 256
 257static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 258{
 259	unsigned long flags;
 260	struct buffer_head *first;
 261	struct buffer_head *tmp;
 262	struct folio *folio;
 263	int folio_uptodate = 1;
 264
 265	BUG_ON(!buffer_async_read(bh));
 266
 267	folio = bh->b_folio;
 268	if (uptodate) {
 269		set_buffer_uptodate(bh);
 270	} else {
 271		clear_buffer_uptodate(bh);
 272		buffer_io_error(bh, ", async page read");
 273	}
 274
 275	/*
 276	 * Be _very_ careful from here on. Bad things can happen if
 277	 * two buffer heads end IO at almost the same time and both
 278	 * decide that the page is now completely done.
 279	 */
 280	first = folio_buffers(folio);
 281	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 282	clear_buffer_async_read(bh);
 283	unlock_buffer(bh);
 284	tmp = bh;
 285	do {
 286		if (!buffer_uptodate(tmp))
 287			folio_uptodate = 0;
 288		if (buffer_async_read(tmp)) {
 289			BUG_ON(!buffer_locked(tmp));
 290			goto still_busy;
 291		}
 292		tmp = tmp->b_this_page;
 293	} while (tmp != bh);
 294	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 295
 296	folio_end_read(folio, folio_uptodate);
 297	return;
 298
 299still_busy:
 300	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 301}
 302
/*
 * Context for post-read processing (decryption and/or verity verification)
 * of a buffer.  Allocated in end_buffer_async_read_io() and freed by the
 * work function that finally completes the read.
 */
struct postprocess_bh_ctx {
	struct work_struct work;	/* runs decrypt_bh() or verify_bh() */
	struct buffer_head *bh;		/* buffer whose read just completed */
	struct fsverity_info *vi;	/* verity info, NULL if not verifying */
};
 308
 309static void verify_bh(struct work_struct *work)
 310{
 311	struct postprocess_bh_ctx *ctx =
 312		container_of(work, struct postprocess_bh_ctx, work);
 313	struct buffer_head *bh = ctx->bh;
 314	bool valid;
 315
 316	valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
 317				       bh_offset(bh));
 318	end_buffer_async_read(bh, valid);
 319	kfree(ctx);
 320}
 321
 322static void decrypt_bh(struct work_struct *work)
 323{
 324	struct postprocess_bh_ctx *ctx =
 325		container_of(work, struct postprocess_bh_ctx, work);
 326	struct buffer_head *bh = ctx->bh;
 327	int err;
 328
 329	err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
 330					       bh_offset(bh));
 331	if (err == 0 && ctx->vi) {
 332		/*
 333		 * We use different work queues for decryption and for verity
 334		 * because verity may require reading metadata pages that need
 335		 * decryption, and we shouldn't recurse to the same workqueue.
 336		 */
 337		INIT_WORK(&ctx->work, verify_bh);
 338		fsverity_enqueue_verify_work(&ctx->work);
 339		return;
 340	}
 341	end_buffer_async_read(bh, err == 0);
 342	kfree(ctx);
 343}
 344
 345/*
 346 * I/O completion handler for block_read_full_folio() - pages
 347 * which come unlocked at the end of I/O.
 348 */
 349static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
 350{
 351	struct inode *inode = bh->b_folio->mapping->host;
 352	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
 353	struct fsverity_info *vi = NULL;
 354
 355	/* needed by ext4 */
 356	if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
 357		vi = fsverity_get_info(inode);
 358
 359	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
 360	if (uptodate && (decrypt || vi)) {
 361		struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
 362
 363		if (ctx) {
 364			ctx->bh = bh;
 365			ctx->vi = vi;
 366			if (decrypt) {
 367				INIT_WORK(&ctx->work, decrypt_bh);
 368				fscrypt_enqueue_decrypt_work(&ctx->work);
 369			} else {
 370				INIT_WORK(&ctx->work, verify_bh);
 371				fsverity_enqueue_verify_work(&ctx->work);
 372			}
 373			return;
 374		}
 375		uptodate = 0;
 376	}
 377	end_buffer_async_read(bh, uptodate);
 378}
 379
 380/*
 381 * Completion handler for block_write_full_folio() - folios which are unlocked
 382 * during I/O, and which have the writeback flag cleared upon I/O completion.
 383 */
 384static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 385{
 386	unsigned long flags;
 387	struct buffer_head *first;
 388	struct buffer_head *tmp;
 389	struct folio *folio;
 390
 391	BUG_ON(!buffer_async_write(bh));
 392
 393	folio = bh->b_folio;
 394	if (uptodate) {
 395		set_buffer_uptodate(bh);
 396	} else {
 397		buffer_io_error(bh, ", lost async page write");
 398		mark_buffer_write_io_error(bh);
 399		clear_buffer_uptodate(bh);
 400	}
 401
 402	first = folio_buffers(folio);
 403	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 404
 405	clear_buffer_async_write(bh);
 406	unlock_buffer(bh);
 407	tmp = bh->b_this_page;
 408	while (tmp != bh) {
 409		if (buffer_async_write(tmp)) {
 410			BUG_ON(!buffer_locked(tmp));
 411			goto still_busy;
 412		}
 413		tmp = tmp->b_this_page;
 414	}
 415	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 416	folio_end_writeback(folio);
 417	return;
 418
 419still_busy:
 420	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 421}
 422
/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 *
 * mark_buffer_async_read() installs end_buffer_async_read_io() as the
 * completion handler of @bh and flags the buffer as being under async read.
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read_io;
	set_buffer_async_read(bh);
}
 449
/*
 * Install @handler as the I/O completion handler of @bh and flag the
 * buffer as being under async write.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}
 456
/*
 * Flag @bh as being under async write, with end_buffer_async_write() as
 * its completion handler.
 */
void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);
 462
 463
 464/*
 465 * fs/buffer.c contains helper functions for buffer-backed address space's
 466 * fsync functions.  A common requirement for buffer-based filesystems is
 467 * that certain data from the backing blockdev needs to be written out for
 468 * a successful fsync().  For example, ext2 indirect blocks need to be
 469 * written back and waited upon before fsync() returns.
 470 *
 471 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 472 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 473 * management of a list of dependent buffers at ->i_mapping->i_private_list.
 474 *
 475 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 476 * from their controlling inode's queue when they are being freed.  But
 477 * try_to_free_buffers() will be operating against the *blockdev* mapping
 478 * at the time, not against the S_ISREG file which depends on those buffers.
 479 * So the locking for i_private_list is via the i_private_lock in the address_space
 480 * which backs the buffers.  Which is different from the address_space 
 481 * against which the buffers are listed.  So for a particular address_space,
 482 * mapping->i_private_lock does *not* protect mapping->i_private_list!  In fact,
 483 * mapping->i_private_list will always be protected by the backing blockdev's
 484 * ->i_private_lock.
 485 *
 486 * Which introduces a requirement: all buffers on an address_space's
 487 * ->i_private_list must be from the same address_space: the blockdev's.
 488 *
 489 * address_spaces which do not place buffers at ->i_private_list via these
 490 * utility functions are free to use i_private_lock and i_private_list for
 491 * whatever they want.  The only requirement is that list_empty(i_private_list)
 492 * be true at clear_inode() time.
 493 *
 494 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 495 * filesystems should do that.  invalidate_inode_buffers() should just go
 496 * BUG_ON(!list_empty).
 497 *
 498 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 499 * take an address_space, not an inode.  And it should be called
 500 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 501 * queued up.
 502 *
 503 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 504 * list if it is already on a list.  Because if the buffer is on a list,
 505 * it *must* already be on the right one.  If not, the filesystem is being
 506 * silly.  This will save a ton of locking.  But first we have to ensure
 507 * that buffers are taken *off* the old inode's list when they are freed
 508 * (presumably in truncate).  That requires careful auditing of all
 509 * filesystems (do it inside bforget()).  It could also be done by bringing
 510 * b_inode back.
 511 */
 512
 513/*
 514 * The buffer's backing address_space's i_private_lock must be held
 515 */
 516static void __remove_assoc_queue(struct buffer_head *bh)
 517{
 518	list_del_init(&bh->b_assoc_buffers);
 519	WARN_ON(!bh->b_assoc_map);
 520	bh->b_assoc_map = NULL;
 521}
 522
 523int inode_has_buffers(struct inode *inode)
 524{
 525	return !list_empty(&inode->i_data.i_private_list);
 526}
 527
 528/*
 529 * osync is designed to support O_SYNC io.  It waits synchronously for
 530 * all already-submitted IO to complete, but does not queue any new
 531 * writes to the disk.
 532 *
 533 * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
 534 * as you dirty the buffers, and then use osync_inode_buffers to wait for
 535 * completion.  Any other dirty buffers which are not yet queued for
 536 * write will not be flushed to disk by the osync.
 537 */
 538static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 539{
 540	struct buffer_head *bh;
 541	struct list_head *p;
 542	int err = 0;
 543
 544	spin_lock(lock);
 545repeat:
 546	list_for_each_prev(p, list) {
 547		bh = BH_ENTRY(p);
 548		if (buffer_locked(bh)) {
 549			get_bh(bh);
 550			spin_unlock(lock);
 551			wait_on_buffer(bh);
 552			if (!buffer_uptodate(bh))
 553				err = -EIO;
 554			brelse(bh);
 555			spin_lock(lock);
 556			goto repeat;
 557		}
 558	}
 559	spin_unlock(lock);
 560	return err;
 561}
 562
 563/**
 564 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 565 * @mapping: the mapping which wants those buffers written
 566 *
 567 * Starts I/O against the buffers at mapping->i_private_list, and waits upon
 568 * that I/O.
 569 *
 570 * Basically, this is a convenience function for fsync().
 571 * @mapping is a file or directory which needs those buffers to be written for
 572 * a successful fsync().
 573 */
 574int sync_mapping_buffers(struct address_space *mapping)
 575{
 576	struct address_space *buffer_mapping = mapping->i_private_data;
 577
 578	if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
 579		return 0;
 580
 581	return fsync_buffers_list(&buffer_mapping->i_private_lock,
 582					&mapping->i_private_list);
 583}
 584EXPORT_SYMBOL(sync_mapping_buffers);
 585
 586/**
 587 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 588 * for simple filesystems with no inode lock
 589 *
 590 * @file:	file to synchronize
 591 * @start:	start offset in bytes
 592 * @end:	end offset in bytes (inclusive)
 593 * @datasync:	only synchronize essential metadata if true
 594 *
 595 * This is a generic implementation of the fsync method for simple
 596 * filesystems which track all non-inode metadata in the buffers list
 597 * hanging off the address_space structure.
 598 */
 599int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
 600				  bool datasync)
 601{
 602	struct inode *inode = file->f_mapping->host;
 603	int err;
 604	int ret;
 605
 606	err = file_write_and_wait_range(file, start, end);
 607	if (err)
 608		return err;
 609
 610	ret = sync_mapping_buffers(inode->i_mapping);
 611	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
 612		goto out;
 613	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
 614		goto out;
 615
 616	err = sync_inode_metadata(inode, 1);
 617	if (ret == 0)
 618		ret = err;
 619
 620out:
 621	/* check and advance again to catch errors after syncing out buffers */
 622	err = file_check_and_advance_wb_err(file);
 623	if (ret == 0)
 624		ret = err;
 625	return ret;
 626}
 627EXPORT_SYMBOL(generic_buffers_fsync_noflush);
 628
 629/**
 630 * generic_buffers_fsync - generic buffer fsync implementation
 631 * for simple filesystems with no inode lock
 632 *
 633 * @file:	file to synchronize
 634 * @start:	start offset in bytes
 635 * @end:	end offset in bytes (inclusive)
 636 * @datasync:	only synchronize essential metadata if true
 637 *
 638 * This is a generic implementation of the fsync method for simple
 639 * filesystems which track all non-inode metadata in the buffers list
 640 * hanging off the address_space structure. This also makes sure that
 641 * a device cache flush operation is called at the end.
 642 */
 643int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
 644			  bool datasync)
 645{
 646	struct inode *inode = file->f_mapping->host;
 647	int ret;
 648
 649	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
 650	if (!ret)
 651		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 652	return ret;
 653}
 654EXPORT_SYMBOL(generic_buffers_fsync);
 655
 656/*
 657 * Called when we've recently written block `bblock', and it is known that
 658 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 659 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 660 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 661 */
 662void write_boundary_block(struct block_device *bdev,
 663			sector_t bblock, unsigned blocksize)
 664{
 665	struct buffer_head *bh;
 666
 667	bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
 668	if (bh) {
 669		if (buffer_dirty(bh))
 670			write_dirty_buffer(bh, 0);
 671		put_bh(bh);
 672	}
 673}
 674
 675void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 676{
 677	struct address_space *mapping = inode->i_mapping;
 678	struct address_space *buffer_mapping = bh->b_folio->mapping;
 679
 680	mark_buffer_dirty(bh);
 681	if (!mapping->i_private_data) {
 682		mapping->i_private_data = buffer_mapping;
 683	} else {
 684		BUG_ON(mapping->i_private_data != buffer_mapping);
 685	}
 686	if (!bh->b_assoc_map) {
 687		spin_lock(&buffer_mapping->i_private_lock);
 688		list_move_tail(&bh->b_assoc_buffers,
 689				&mapping->i_private_list);
 690		bh->b_assoc_map = mapping;
 691		spin_unlock(&buffer_mapping->i_private_lock);
 692	}
 693}
 694EXPORT_SYMBOL(mark_buffer_dirty_inode);
 695
 696/**
 697 * block_dirty_folio - Mark a folio as dirty.
 698 * @mapping: The address space containing this folio.
 699 * @folio: The folio to mark dirty.
 700 *
 701 * Filesystems which use buffer_heads can use this function as their
 702 * ->dirty_folio implementation.  Some filesystems need to do a little
 703 * work before calling this function.  Filesystems which do not use
 704 * buffer_heads should call filemap_dirty_folio() instead.
 705 *
 706 * If the folio has buffers, the uptodate buffers are set dirty, to
 707 * preserve dirty-state coherency between the folio and the buffers.
 708 * Buffers added to a dirty folio are created dirty.
 709 *
 710 * The buffers are dirtied before the folio is dirtied.  There's a small
 711 * race window in which writeback may see the folio cleanness but not the
 712 * buffer dirtiness.  That's fine.  If this code were to set the folio
 713 * dirty before the buffers, writeback could clear the folio dirty flag,
 714 * see a bunch of clean buffers and we'd end up with dirty buffers/clean
 715 * folio on the dirty folio list.
 716 *
 717 * We use i_private_lock to lock against try_to_free_buffers() while
 718 * using the folio's buffer list.  This also prevents clean buffers
 719 * being added to the folio after it was set dirty.
 720 *
 721 * Context: May only be called from process context.  Does not sleep.
 722 * Caller must ensure that @folio cannot be truncated during this call,
 723 * typically by holding the folio lock or having a page in the folio
 724 * mapped and holding the page table lock.
 725 *
 726 * Return: True if the folio was dirtied; false if it was already dirtied.
 727 */
 728bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
 729{
 730	struct buffer_head *head;
 731	bool newly_dirty;
 732
 733	spin_lock(&mapping->i_private_lock);
 734	head = folio_buffers(folio);
 735	if (head) {
 736		struct buffer_head *bh = head;
 737
 738		do {
 739			set_buffer_dirty(bh);
 740			bh = bh->b_this_page;
 741		} while (bh != head);
 742	}
 743	/*
 744	 * Lock out page's memcg migration to keep PageDirty
 745	 * synchronized with per-memcg dirty page counters.
 746	 */
 747	newly_dirty = !folio_test_set_dirty(folio);
 748	spin_unlock(&mapping->i_private_lock);
 749
 750	if (newly_dirty)
 751		__folio_mark_dirty(folio, mapping, 1);
 752
 753	if (newly_dirty)
 754		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 755
 756	return newly_dirty;
 757}
 758EXPORT_SYMBOL(block_dirty_folio);
 759
 760/*
 761 * Write out and wait upon a list of buffers.
 762 *
 763 * We have conflicting pressures: we want to make sure that all
 764 * initially dirty buffers get waited on, but that any subsequently
 765 * dirtied buffers don't.  After all, we don't want fsync to last
 766 * forever if somebody is actively writing to the file.
 767 *
 768 * Do this in two main stages: first we copy dirty buffers to a
 769 * temporary inode list, queueing the writes as we go.  Then we clean
 770 * up, waiting for those writes to complete.
 771 * 
 772 * During this second stage, any subsequent updates to the file may end
 773 * up refiling the buffer on the original inode's dirty list again, so
 774 * there is a chance we will end up with a buffer queued for write but
 775 * not yet completed on that list.  So, as a final cleanup we go through
 776 * the osync code to catch these locked, dirty buffers without requeuing
 777 * any newly dirty buffers for write.
 778 */
 779static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 780{
 781	struct buffer_head *bh;
 782	struct address_space *mapping;
 783	int err = 0, err2;
 784	struct blk_plug plug;
 785	LIST_HEAD(tmp);
 786
 787	blk_start_plug(&plug);
 788
 789	spin_lock(lock);
 790	while (!list_empty(list)) {
 791		bh = BH_ENTRY(list->next);
 792		mapping = bh->b_assoc_map;
 793		__remove_assoc_queue(bh);
 794		/* Avoid race with mark_buffer_dirty_inode() which does
 795		 * a lockless check and we rely on seeing the dirty bit */
 796		smp_mb();
 797		if (buffer_dirty(bh) || buffer_locked(bh)) {
 798			list_add(&bh->b_assoc_buffers, &tmp);
 799			bh->b_assoc_map = mapping;
 800			if (buffer_dirty(bh)) {
 801				get_bh(bh);
 802				spin_unlock(lock);
 803				/*
 804				 * Ensure any pending I/O completes so that
 805				 * write_dirty_buffer() actually writes the
 806				 * current contents - it is a noop if I/O is
 807				 * still in flight on potentially older
 808				 * contents.
 809				 */
 810				write_dirty_buffer(bh, REQ_SYNC);
 811
 812				/*
 813				 * Kick off IO for the previous mapping. Note
 814				 * that we will not run the very last mapping,
 815				 * wait_on_buffer() will do that for us
 816				 * through sync_buffer().
 817				 */
 818				brelse(bh);
 819				spin_lock(lock);
 820			}
 821		}
 822	}
 823
 824	spin_unlock(lock);
 825	blk_finish_plug(&plug);
 826	spin_lock(lock);
 827
 828	while (!list_empty(&tmp)) {
 829		bh = BH_ENTRY(tmp.prev);
 830		get_bh(bh);
 831		mapping = bh->b_assoc_map;
 832		__remove_assoc_queue(bh);
 833		/* Avoid race with mark_buffer_dirty_inode() which does
 834		 * a lockless check and we rely on seeing the dirty bit */
 835		smp_mb();
 836		if (buffer_dirty(bh)) {
 837			list_add(&bh->b_assoc_buffers,
 838				 &mapping->i_private_list);
 839			bh->b_assoc_map = mapping;
 840		}
 841		spin_unlock(lock);
 842		wait_on_buffer(bh);
 843		if (!buffer_uptodate(bh))
 844			err = -EIO;
 845		brelse(bh);
 846		spin_lock(lock);
 847	}
 848	
 849	spin_unlock(lock);
 850	err2 = osync_buffers_list(lock, list);
 851	if (err)
 852		return err;
 853	else
 854		return err2;
 855}
 856
 857/*
 858 * Invalidate any and all dirty buffers on a given inode.  We are
 859 * probably unmounting the fs, but that doesn't mean we have already
 860 * done a sync().  Just drop the buffers from the inode list.
 861 *
 862 * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
 863 * assumes that all the buffers are against the blockdev.
 864 */
 865void invalidate_inode_buffers(struct inode *inode)
 866{
 867	if (inode_has_buffers(inode)) {
 868		struct address_space *mapping = &inode->i_data;
 869		struct list_head *list = &mapping->i_private_list;
 870		struct address_space *buffer_mapping = mapping->i_private_data;
 871
 872		spin_lock(&buffer_mapping->i_private_lock);
 873		while (!list_empty(list))
 874			__remove_assoc_queue(BH_ENTRY(list->next));
 875		spin_unlock(&buffer_mapping->i_private_lock);
 876	}
 877}
 878EXPORT_SYMBOL(invalidate_inode_buffers);
 879
 880/*
 881 * Remove any clean buffers from the inode's buffer list.  This is called
 882 * when we're trying to free the inode itself.  Those buffers can pin it.
 883 *
 884 * Returns true if all buffers were removed.
 885 */
 886int remove_inode_buffers(struct inode *inode)
 887{
 888	int ret = 1;
 889
 890	if (inode_has_buffers(inode)) {
 891		struct address_space *mapping = &inode->i_data;
 892		struct list_head *list = &mapping->i_private_list;
 893		struct address_space *buffer_mapping = mapping->i_private_data;
 894
 895		spin_lock(&buffer_mapping->i_private_lock);
 896		while (!list_empty(list)) {
 897			struct buffer_head *bh = BH_ENTRY(list->next);
 898			if (buffer_dirty(bh)) {
 899				ret = 0;
 900				break;
 901			}
 902			__remove_assoc_queue(bh);
 903		}
 904		spin_unlock(&buffer_mapping->i_private_lock);
 905	}
 906	return ret;
 907}
 908
 909/*
 910 * Create the appropriate buffers when given a folio for data area and
 911 * the size of each buffer.. Use the bh->b_this_page linked list to
 912 * follow the buffers created.  Return NULL if unable to create more
 913 * buffers.
 914 *
 915 * The retry flag is used to differentiate async IO (paging, swapping)
 916 * which may not fail from ordinary buffer allocations.
 917 */
 918struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 919					gfp_t gfp)
 920{
 921	struct buffer_head *bh, *head;
 922	long offset;
 923	struct mem_cgroup *memcg, *old_memcg;
 924
 925	/* The folio lock pins the memcg */
 926	memcg = folio_memcg(folio);
 927	old_memcg = set_active_memcg(memcg);
 928
 929	head = NULL;
 930	offset = folio_size(folio);
 931	while ((offset -= size) >= 0) {
 932		bh = alloc_buffer_head(gfp);
 933		if (!bh)
 934			goto no_grow;
 935
 936		bh->b_this_page = head;
 937		bh->b_blocknr = -1;
 938		head = bh;
 939
 940		bh->b_size = size;
 941
 942		/* Link the buffer to its folio */
 943		folio_set_bh(bh, folio, offset);
 944	}
 945out:
 946	set_active_memcg(old_memcg);
 947	return head;
 948/*
 949 * In case anything failed, we just free everything we got.
 950 */
 951no_grow:
 952	if (head) {
 953		do {
 954			bh = head;
 955			head = head->b_this_page;
 956			free_buffer_head(bh);
 957		} while (head);
 958	}
 959
 960	goto out;
 961}
 962EXPORT_SYMBOL_GPL(folio_alloc_buffers);
 963
 964struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
 965{
 966	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 967
 968	return folio_alloc_buffers(page_folio(page), size, gfp);
 969}
 970EXPORT_SYMBOL_GPL(alloc_page_buffers);
 971
 972static inline void link_dev_buffers(struct folio *folio,
 973		struct buffer_head *head)
 974{
 975	struct buffer_head *bh, *tail;
 976
 977	bh = head;
 978	do {
 979		tail = bh;
 980		bh = bh->b_this_page;
 981	} while (bh);
 982	tail->b_this_page = head;
 983	folio_attach_private(folio, head);
 984}
 985
 986static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 987{
 988	sector_t retval = ~((sector_t)0);
 989	loff_t sz = bdev_nr_bytes(bdev);
 990
 991	if (sz) {
 992		unsigned int sizebits = blksize_bits(size);
 993		retval = (sz >> sizebits);
 994	}
 995	return retval;
 996}
 997
 998/*
 999 * Initialise the state of a blockdev folio's buffers.
1000 */ 
1001static sector_t folio_init_buffers(struct folio *folio,
1002		struct block_device *bdev, unsigned size)
1003{
1004	struct buffer_head *head = folio_buffers(folio);
1005	struct buffer_head *bh = head;
1006	bool uptodate = folio_test_uptodate(folio);
1007	sector_t block = div_u64(folio_pos(folio), size);
1008	sector_t end_block = blkdev_max_block(bdev, size);
1009
1010	do {
1011		if (!buffer_mapped(bh)) {
1012			bh->b_end_io = NULL;
1013			bh->b_private = NULL;
1014			bh->b_bdev = bdev;
1015			bh->b_blocknr = block;
1016			if (uptodate)
1017				set_buffer_uptodate(bh);
1018			if (block < end_block)
1019				set_buffer_mapped(bh);
1020		}
1021		block++;
1022		bh = bh->b_this_page;
1023	} while (bh != head);
1024
1025	/*
1026	 * Caller needs to validate requested block against end of device.
1027	 */
1028	return end_block;
1029}
1030
1031/*
1032 * Create the page-cache folio that contains the requested block.
1033 *
1034 * This is used purely for blockdev mappings.
1035 *
1036 * Returns false if we have a failure which cannot be cured by retrying
1037 * without sleeping.  Returns true if we succeeded, or the caller should retry.
1038 */
1039static bool grow_dev_folio(struct block_device *bdev, sector_t block,
1040		pgoff_t index, unsigned size, gfp_t gfp)
1041{
1042	struct address_space *mapping = bdev->bd_mapping;
1043	struct folio *folio;
1044	struct buffer_head *bh;
1045	sector_t end_block = 0;
1046
1047	folio = __filemap_get_folio(mapping, index,
1048			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
1049	if (IS_ERR(folio))
1050		return false;
1051
1052	bh = folio_buffers(folio);
1053	if (bh) {
1054		if (bh->b_size == size) {
1055			end_block = folio_init_buffers(folio, bdev, size);
1056			goto unlock;
1057		}
1058
1059		/*
1060		 * Retrying may succeed; for example the folio may finish
1061		 * writeback, or buffers may be cleaned.  This should not
1062		 * happen very often; maybe we have old buffers attached to
1063		 * this blockdev's page cache and we're trying to change
1064		 * the block size?
1065		 */
1066		if (!try_to_free_buffers(folio)) {
1067			end_block = ~0ULL;
1068			goto unlock;
1069		}
1070	}
1071
1072	bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
1073	if (!bh)
1074		goto unlock;
1075
1076	/*
1077	 * Link the folio to the buffers and initialise them.  Take the
1078	 * lock to be atomic wrt __find_get_block(), which does not
1079	 * run under the folio lock.
1080	 */
1081	spin_lock(&mapping->i_private_lock);
1082	link_dev_buffers(folio, bh);
1083	end_block = folio_init_buffers(folio, bdev, size);
1084	spin_unlock(&mapping->i_private_lock);
1085unlock:
1086	folio_unlock(folio);
1087	folio_put(folio);
1088	return block < end_block;
1089}
1090
1091/*
1092 * Create buffers for the specified block device block's folio.  If
1093 * that folio was dirty, the buffers are set dirty also.  Returns false
1094 * if we've hit a permanent error.
1095 */
1096static bool grow_buffers(struct block_device *bdev, sector_t block,
1097		unsigned size, gfp_t gfp)
1098{
1099	loff_t pos;
1100
1101	/*
1102	 * Check for a block which lies outside our maximum possible
1103	 * pagecache index.
1104	 */
1105	if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
1106		printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
1107			__func__, (unsigned long long)block,
1108			bdev);
1109		return false;
1110	}
1111
1112	/* Create a folio with the proper size buffers */
1113	return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
1114}
1115
1116static struct buffer_head *
1117__getblk_slow(struct block_device *bdev, sector_t block,
1118	     unsigned size, gfp_t gfp)
1119{
1120	bool blocking = gfpflags_allow_blocking(gfp);
1121
1122	if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
1123		printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
1124		       size, bdev_logical_block_size(bdev));
1125		return NULL;
1126	}
1127
1128	for (;;) {
1129		struct buffer_head *bh;
1130
1131		if (!grow_buffers(bdev, block, size, gfp))
1132			return NULL;
1133
1134		if (blocking)
1135			bh = __find_get_block_nonatomic(bdev, block, size);
1136		else
1137			bh = __find_get_block(bdev, block, size);
1138		if (bh)
1139			return bh;
1140	}
1141}
1142
1143/*
1144 * The relationship between dirty buffers and dirty pages:
1145 *
1146 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1147 * the page is tagged dirty in the page cache.
1148 *
1149 * At all times, the dirtiness of the buffers represents the dirtiness of
1150 * subsections of the page.  If the page has buffers, the page dirty bit is
1151 * merely a hint about the true dirty state.
1152 *
1153 * When a page is set dirty in its entirety, all its buffers are marked dirty
1154 * (if the page has buffers).
1155 *
1156 * When a buffer is marked dirty, its page is dirtied, but the page's other
1157 * buffers are not.
1158 *
1159 * Also.  When blockdev buffers are explicitly read with bread(), they
1160 * individually become uptodate.  But their backing page remains not
1161 * uptodate - even if all of its buffers are uptodate.  A subsequent
1162 * block_read_full_folio() against that folio will discover all the uptodate
1163 * buffers, will set the folio uptodate and will perform no I/O.
1164 */
1165
1166/**
1167 * mark_buffer_dirty - mark a buffer_head as needing writeout
1168 * @bh: the buffer_head to mark dirty
1169 *
1170 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1171 * its backing page dirty, then tag the page as dirty in the page cache
1172 * and then attach the address_space's inode to its superblock's dirty
1173 * inode list.
1174 *
1175 * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->i_private_lock,
1176 * i_pages lock and mapping->host->i_lock.
1177 */
1178void mark_buffer_dirty(struct buffer_head *bh)
1179{
1180	WARN_ON_ONCE(!buffer_uptodate(bh));
1181
1182	trace_block_dirty_buffer(bh);
1183
1184	/*
1185	 * Very *carefully* optimize the it-is-already-dirty case.
1186	 *
1187	 * Don't let the final "is it dirty" escape to before we
1188	 * perhaps modified the buffer.
1189	 */
1190	if (buffer_dirty(bh)) {
1191		smp_mb();
1192		if (buffer_dirty(bh))
1193			return;
1194	}
1195
1196	if (!test_set_buffer_dirty(bh)) {
1197		struct folio *folio = bh->b_folio;
1198		struct address_space *mapping = NULL;
1199
1200		if (!folio_test_set_dirty(folio)) {
1201			mapping = folio->mapping;
1202			if (mapping)
1203				__folio_mark_dirty(folio, mapping, 0);
1204		}
1205		if (mapping)
1206			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1207	}
1208}
1209EXPORT_SYMBOL(mark_buffer_dirty);
1210
1211void mark_buffer_write_io_error(struct buffer_head *bh)
1212{
1213	set_buffer_write_io_error(bh);
1214	/* FIXME: do we need to set this in both places? */
1215	if (bh->b_folio && bh->b_folio->mapping)
1216		mapping_set_error(bh->b_folio->mapping, -EIO);
1217	if (bh->b_assoc_map)
1218		mapping_set_error(bh->b_assoc_map, -EIO);
1219}
1220EXPORT_SYMBOL(mark_buffer_write_io_error);
1221
1222/**
1223 * __brelse - Release a buffer.
1224 * @bh: The buffer to release.
1225 *
1226 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
1227 */
1228void __brelse(struct buffer_head *bh)
1229{
1230	if (atomic_read(&bh->b_count)) {
1231		put_bh(bh);
1232		return;
1233	}
1234	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1235}
1236EXPORT_SYMBOL(__brelse);
1237
1238/**
1239 * __bforget - Discard any dirty data in a buffer.
1240 * @bh: The buffer to forget.
1241 *
1242 * This variant of bforget() can be called if @bh is guaranteed to not
1243 * be NULL.
1244 */
1245void __bforget(struct buffer_head *bh)
1246{
1247	clear_buffer_dirty(bh);
1248	if (bh->b_assoc_map) {
1249		struct address_space *buffer_mapping = bh->b_folio->mapping;
1250
1251		spin_lock(&buffer_mapping->i_private_lock);
1252		list_del_init(&bh->b_assoc_buffers);
1253		bh->b_assoc_map = NULL;
1254		spin_unlock(&buffer_mapping->i_private_lock);
1255	}
1256	__brelse(bh);
1257}
1258EXPORT_SYMBOL(__bforget);
1259
1260static struct buffer_head *__bread_slow(struct buffer_head *bh)
1261{
1262	lock_buffer(bh);
1263	if (buffer_uptodate(bh)) {
1264		unlock_buffer(bh);
1265		return bh;
1266	} else {
1267		get_bh(bh);
1268		bh->b_end_io = end_buffer_read_sync;
1269		submit_bh(REQ_OP_READ, bh);
1270		wait_on_buffer(bh);
1271		if (buffer_uptodate(bh))
1272			return bh;
1273	}
1274	brelse(bh);
1275	return NULL;
1276}
1277
1278/*
1279 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1280 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1281 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1282 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1283 * CPU's LRUs at the same time.
1284 *
1285 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1286 * sb_find_get_block().
1287 *
1288 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1289 * a local interrupt disable for that.
1290 */
1291
1292#define BH_LRU_SIZE	16
1293
1294struct bh_lru {
1295	struct buffer_head *bhs[BH_LRU_SIZE];
1296};
1297
1298static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1299
1300#ifdef CONFIG_SMP
1301#define bh_lru_lock()	local_irq_disable()
1302#define bh_lru_unlock()	local_irq_enable()
1303#else
1304#define bh_lru_lock()	preempt_disable()
1305#define bh_lru_unlock()	preempt_enable()
1306#endif
1307
/*
 * BUG if interrupts are disabled.  The check is compiled in only when
 * irqs_disabled is available as a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1314
1315/*
1316 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
1317 * inserted at the front, and the buffer_head at the back if any is evicted.
1318 * Or, if already in the LRU it is moved to the front.
1319 */
1320static void bh_lru_install(struct buffer_head *bh)
1321{
1322	struct buffer_head *evictee = bh;
1323	struct bh_lru *b;
1324	int i;
1325
1326	check_irqs_on();
1327	bh_lru_lock();
1328
1329	/*
1330	 * the refcount of buffer_head in bh_lru prevents dropping the
1331	 * attached page(i.e., try_to_free_buffers) so it could cause
1332	 * failing page migration.
1333	 * Skip putting upcoming bh into bh_lru until migration is done.
1334	 */
1335	if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
1336		bh_lru_unlock();
1337		return;
1338	}
1339
1340	b = this_cpu_ptr(&bh_lrus);
1341	for (i = 0; i < BH_LRU_SIZE; i++) {
1342		swap(evictee, b->bhs[i]);
1343		if (evictee == bh) {
1344			bh_lru_unlock();
1345			return;
1346		}
1347	}
1348
1349	get_bh(bh);
1350	bh_lru_unlock();
1351	brelse(evictee);
1352}
1353
1354/*
1355 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1356 */
1357static struct buffer_head *
1358lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1359{
1360	struct buffer_head *ret = NULL;
1361	unsigned int i;
1362
1363	check_irqs_on();
1364	bh_lru_lock();
1365	if (cpu_is_isolated(smp_processor_id())) {
1366		bh_lru_unlock();
1367		return NULL;
1368	}
1369	for (i = 0; i < BH_LRU_SIZE; i++) {
1370		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1371
1372		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1373		    bh->b_size == size) {
1374			if (i) {
1375				while (i) {
1376					__this_cpu_write(bh_lrus.bhs[i],
1377						__this_cpu_read(bh_lrus.bhs[i - 1]));
1378					i--;
1379				}
1380				__this_cpu_write(bh_lrus.bhs[0], bh);
1381			}
1382			get_bh(bh);
1383			ret = bh;
1384			break;
1385		}
1386	}
1387	bh_lru_unlock();
1388	return ret;
1389}
1390
1391/*
1392 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1393 * it in the LRU and mark it as accessed.  If it is not present then return
1394 * NULL. Atomic context callers may also return NULL if the buffer is being
1395 * migrated; similarly the page is not marked accessed either.
1396 */
1397static struct buffer_head *
1398find_get_block_common(struct block_device *bdev, sector_t block,
1399			unsigned size, bool atomic)
1400{
1401	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1402
1403	if (bh == NULL) {
1404		/* __find_get_block_slow will mark the page accessed */
1405		bh = __find_get_block_slow(bdev, block, atomic);
1406		if (bh)
1407			bh_lru_install(bh);
1408	} else
1409		touch_buffer(bh);
1410
1411	return bh;
1412}
1413
1414struct buffer_head *
1415__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1416{
1417	return find_get_block_common(bdev, block, size, true);
1418}
1419EXPORT_SYMBOL(__find_get_block);
1420
1421/* same as __find_get_block() but allows sleeping contexts */
1422struct buffer_head *
1423__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
1424			   unsigned size)
1425{
1426	return find_get_block_common(bdev, block, size, false);
1427}
1428EXPORT_SYMBOL(__find_get_block_nonatomic);
1429
1430/**
1431 * bdev_getblk - Get a buffer_head in a block device's buffer cache.
1432 * @bdev: The block device.
1433 * @block: The block number.
1434 * @size: The size of buffer_heads for this @bdev.
1435 * @gfp: The memory allocation flags to use.
1436 *
1437 * The returned buffer head has its reference count incremented, but is
1438 * not locked.  The caller should call brelse() when it has finished
1439 * with the buffer.  The buffer may not be uptodate.  If needed, the
1440 * caller can bring it uptodate either by reading it or overwriting it.
1441 *
1442 * Return: The buffer head, or NULL if memory could not be allocated.
1443 */
1444struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
1445		unsigned size, gfp_t gfp)
1446{
1447	struct buffer_head *bh;
1448
1449	if (gfpflags_allow_blocking(gfp))
1450		bh = __find_get_block_nonatomic(bdev, block, size);
1451	else
1452		bh = __find_get_block(bdev, block, size);
1453
1454	might_alloc(gfp);
1455	if (bh)
1456		return bh;
1457
1458	return __getblk_slow(bdev, block, size, gfp);
1459}
1460EXPORT_SYMBOL(bdev_getblk);
1461
/*
 * Do async read-ahead on a buffer.
 */
1465void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1466{
1467	struct buffer_head *bh = bdev_getblk(bdev, block, size,
1468			GFP_NOWAIT | __GFP_MOVABLE);
1469
1470	if (likely(bh)) {
1471		bh_readahead(bh, REQ_RAHEAD);
1472		brelse(bh);
1473	}
1474}
1475EXPORT_SYMBOL(__breadahead);
1476
1477/**
1478 * __bread_gfp() - Read a block.
1479 * @bdev: The block device to read from.
1480 * @block: Block number in units of block size.
1481 * @size: The block size of this device in bytes.
1482 * @gfp: Not page allocation flags; see below.
1483 *
1484 * You are not expected to call this function.  You should use one of
1485 * sb_bread(), sb_bread_unmovable() or __bread().
1486 *
1487 * Read a specified block, and return the buffer head that refers to it.
1488 * If @gfp is 0, the memory will be allocated using the block device's
1489 * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
1490 * allocated from a movable area.  Do not pass in a complete set of
1491 * GFP flags.
1492 *
1493 * The returned buffer head has its refcount increased.  The caller should
1494 * call brelse() when it has finished with the buffer.
1495 *
1496 * Context: May sleep waiting for I/O.
1497 * Return: NULL if the block was unreadable.
1498 */
1499struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
1500		unsigned size, gfp_t gfp)
1501{
1502	struct buffer_head *bh;
1503
1504	gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
1505
1506	/*
1507	 * Prefer looping in the allocator rather than here, at least that
1508	 * code knows what it's doing.
1509	 */
1510	gfp |= __GFP_NOFAIL;
1511
1512	bh = bdev_getblk(bdev, block, size, gfp);
1513
1514	if (likely(bh) && !buffer_uptodate(bh))
1515		bh = __bread_slow(bh);
1516	return bh;
1517}
1518EXPORT_SYMBOL(__bread_gfp);
1519
1520static void __invalidate_bh_lrus(struct bh_lru *b)
1521{
1522	int i;
1523
1524	for (i = 0; i < BH_LRU_SIZE; i++) {
1525		brelse(b->bhs[i]);
1526		b->bhs[i] = NULL;
1527	}
1528}
1529/*
1530 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1531 * This doesn't race because it runs in each cpu either in irq
1532 * or with preempt disabled.
1533 */
1534static void invalidate_bh_lru(void *arg)
1535{
1536	struct bh_lru *b = &get_cpu_var(bh_lrus);
1537
1538	__invalidate_bh_lrus(b);
1539	put_cpu_var(bh_lrus);
1540}
1541
1542bool has_bh_in_lru(int cpu, void *dummy)
1543{
1544	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1545	int i;
1546	
1547	for (i = 0; i < BH_LRU_SIZE; i++) {
1548		if (b->bhs[i])
1549			return true;
1550	}
1551
1552	return false;
1553}
1554
1555void invalidate_bh_lrus(void)
1556{
1557	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
1558}
1559EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1560
1561/*
1562 * It's called from workqueue context so we need a bh_lru_lock to close
1563 * the race with preemption/irq.
1564 */
1565void invalidate_bh_lrus_cpu(void)
1566{
1567	struct bh_lru *b;
1568
1569	bh_lru_lock();
1570	b = this_cpu_ptr(&bh_lrus);
1571	__invalidate_bh_lrus(b);
1572	bh_lru_unlock();
1573}
1574
1575void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1576		  unsigned long offset)
1577{
1578	bh->b_folio = folio;
1579	BUG_ON(offset >= folio_size(folio));
1580	if (folio_test_highmem(folio))
1581		/*
1582		 * This catches illegal uses and preserves the offset:
1583		 */
1584		bh->b_data = (char *)(0 + offset);
1585	else
1586		bh->b_data = folio_address(folio) + offset;
1587}
1588EXPORT_SYMBOL(folio_set_bh);
1589
1590/*
1591 * Called when truncating a buffer on a page completely.
1592 */
1593
1594/* Bits that are cleared during an invalidate */
1595#define BUFFER_FLAGS_DISCARD \
1596	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1597	 1 << BH_Delay | 1 << BH_Unwritten)
1598
1599static void discard_buffer(struct buffer_head * bh)
1600{
1601	unsigned long b_state;
1602
1603	lock_buffer(bh);
1604	clear_buffer_dirty(bh);
1605	bh->b_bdev = NULL;
1606	b_state = READ_ONCE(bh->b_state);
1607	do {
1608	} while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
1609				      b_state & ~BUFFER_FLAGS_DISCARD));
1610	unlock_buffer(bh);
1611}
1612
1613/**
1614 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1615 * @folio: The folio which is affected.
1616 * @offset: start of the range to invalidate
1617 * @length: length of the range to invalidate
1618 *
1619 * block_invalidate_folio() is called when all or part of the folio has been
1620 * invalidated by a truncate operation.
1621 *
1622 * block_invalidate_folio() does not have to release all buffers, but it must
1623 * ensure that no dirty buffer is left outside @offset and that no I/O
1624 * is underway against any of the blocks which are outside the truncation
1625 * point.  Because the caller is about to free (and possibly reuse) those
1626 * blocks on-disk.
1627 */
1628void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1629{
1630	struct buffer_head *head, *bh, *next;
1631	size_t curr_off = 0;
1632	size_t stop = length + offset;
1633
1634	BUG_ON(!folio_test_locked(folio));
1635
1636	/*
1637	 * Check for overflow
1638	 */
1639	BUG_ON(stop > folio_size(folio) || stop < length);
1640
1641	head = folio_buffers(folio);
1642	if (!head)
1643		return;
1644
1645	bh = head;
1646	do {
1647		size_t next_off = curr_off + bh->b_size;
1648		next = bh->b_this_page;
1649
1650		/*
1651		 * Are we still fully in range ?
1652		 */
1653		if (next_off > stop)
1654			goto out;
1655
1656		/*
1657		 * is this block fully invalidated?
1658		 */
1659		if (offset <= curr_off)
1660			discard_buffer(bh);
1661		curr_off = next_off;
1662		bh = next;
1663	} while (bh != head);
1664
1665	/*
1666	 * We release buffers only if the entire folio is being invalidated.
1667	 * The get_block cached value has been unconditionally invalidated,
1668	 * so real IO is not possible anymore.
1669	 */
1670	if (length == folio_size(folio))
1671		filemap_release_folio(folio, 0);
1672out:
1673	folio_clear_mappedtodisk(folio);
1674}
1675EXPORT_SYMBOL(block_invalidate_folio);
1676
1677/*
1678 * We attach and possibly dirty the buffers atomically wrt
1679 * block_dirty_folio() via i_private_lock.  try_to_free_buffers
1680 * is already excluded via the folio lock.
1681 */
1682struct buffer_head *create_empty_buffers(struct folio *folio,
1683		unsigned long blocksize, unsigned long b_state)
1684{
1685	struct buffer_head *bh, *head, *tail;
1686	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
1687
1688	head = folio_alloc_buffers(folio, blocksize, gfp);
1689	bh = head;
1690	do {
1691		bh->b_state |= b_state;
1692		tail = bh;
1693		bh = bh->b_this_page;
1694	} while (bh);
1695	tail->b_this_page = head;
1696
1697	spin_lock(&folio->mapping->i_private_lock);
1698	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
1699		bh = head;
1700		do {
1701			if (folio_test_dirty(folio))
1702				set_buffer_dirty(bh);
1703			if (folio_test_uptodate(folio))
1704				set_buffer_uptodate(bh);
1705			bh = bh->b_this_page;
1706		} while (bh != head);
1707	}
1708	folio_attach_private(folio, head);
1709	spin_unlock(&folio->mapping->i_private_lock);
1710
1711	return head;
1712}
1713EXPORT_SYMBOL(create_empty_buffers);
1714
1715/**
1716 * clean_bdev_aliases: clean a range of buffers in block device
1717 * @bdev: Block device to clean buffers in
1718 * @block: Start of a range of blocks to clean
1719 * @len: Number of blocks to clean
1720 *
1721 * We are taking a range of blocks for data and we don't want writeback of any
1722 * buffer-cache aliases starting from return from this function and until the
1723 * moment when something will explicitly mark the buffer dirty (hopefully that
1724 * will not happen until we will free that block ;-) We don't even need to mark
1725 * it not-uptodate - nobody can expect anything from a newly allocated buffer
1726 * anyway. We used to use unmap_buffer() for such invalidation, but that was
1727 * wrong. We definitely don't want to mark the alias unmapped, for example - it
1728 * would confuse anyone who might pick it with bread() afterwards...
1729 *
1730 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
1731 * writeout I/O going on against recently-freed buffers.  We don't wait on that
1732 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1733 * need to.  That happens here.
1734 */
1735void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1736{
1737	struct address_space *bd_mapping = bdev->bd_mapping;
1738	const int blkbits = bd_mapping->host->i_blkbits;
1739	struct folio_batch fbatch;
1740	pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
1741	pgoff_t end;
1742	int i, count;
1743	struct buffer_head *bh;
1744	struct buffer_head *head;
1745
1746	end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
1747	folio_batch_init(&fbatch);
1748	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
1749		count = folio_batch_count(&fbatch);
1750		for (i = 0; i < count; i++) {
1751			struct folio *folio = fbatch.folios[i];
1752
1753			if (!folio_buffers(folio))
1754				continue;
1755			/*
1756			 * We use folio lock instead of bd_mapping->i_private_lock
1757			 * to pin buffers here since we can afford to sleep and
1758			 * it scales better than a global spinlock lock.
1759			 */
1760			folio_lock(folio);
1761			/* Recheck when the folio is locked which pins bhs */
1762			head = folio_buffers(folio);
1763			if (!head)
1764				goto unlock_page;
1765			bh = head;
1766			do {
1767				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
1768					goto next;
1769				if (bh->b_blocknr >= block + len)
1770					break;
1771				clear_buffer_dirty(bh);
1772				wait_on_buffer(bh);
1773				clear_buffer_req(bh);
1774next:
1775				bh = bh->b_this_page;
1776			} while (bh != head);
1777unlock_page:
1778			folio_unlock(folio);
1779		}
1780		folio_batch_release(&fbatch);
1781		cond_resched();
1782		/* End of range already reached? */
1783		if (index > end || !index)
1784			break;
1785	}
1786}
1787EXPORT_SYMBOL(clean_bdev_aliases);
1788
1789static struct buffer_head *folio_create_buffers(struct folio *folio,
1790						struct inode *inode,
1791						unsigned int b_state)
1792{
1793	struct buffer_head *bh;
1794
1795	BUG_ON(!folio_test_locked(folio));
1796
1797	bh = folio_buffers(folio);
1798	if (!bh)
1799		bh = create_empty_buffers(folio,
1800				1 << READ_ONCE(inode->i_blkbits), b_state);
1801	return bh;
1802}
1803
1804/*
1805 * NOTE! All mapped/uptodate combinations are valid:
1806 *
1807 *	Mapped	Uptodate	Meaning
1808 *
1809 *	No	No		"unknown" - must do get_block()
1810 *	No	Yes		"hole" - zero-filled
1811 *	Yes	No		"allocated" - allocated on disk, not read in
1812 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1813 *
1814 * "Dirty" is valid only with the last case (mapped+uptodate).
1815 */
1816
1817/*
1818 * While block_write_full_folio is writing back the dirty buffers under
1819 * the page lock, whoever dirtied the buffers may decide to clean them
1820 * again at any time.  We handle that by only looking at the buffer
1821 * state inside lock_buffer().
1822 *
1823 * If block_write_full_folio() is called for regular writeback
1824 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1825 * locked buffer.   This only can happen if someone has written the buffer
1826 * directly, with submit_bh().  At the address_space level PageWriteback
1827 * prevents this contention from occurring.
1828 *
1829 * If block_write_full_folio() is called with wbc->sync_mode ==
1830 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1831 * causes the writes to be flagged as synchronous writes.
1832 */
1833int __block_write_full_folio(struct inode *inode, struct folio *folio,
1834			get_block_t *get_block, struct writeback_control *wbc)
1835{
1836	int err;
1837	sector_t block;
1838	sector_t last_block;
1839	struct buffer_head *bh, *head;
1840	size_t blocksize;
1841	int nr_underway = 0;
1842	blk_opf_t write_flags = wbc_to_write_flags(wbc);
1843
1844	head = folio_create_buffers(folio, inode,
1845				    (1 << BH_Dirty) | (1 << BH_Uptodate));
1846
1847	/*
1848	 * Be very careful.  We have no exclusion from block_dirty_folio
1849	 * here, and the (potentially unmapped) buffers may become dirty at
1850	 * any time.  If a buffer becomes dirty here after we've inspected it
1851	 * then we just miss that fact, and the folio stays dirty.
1852	 *
1853	 * Buffers outside i_size may be dirtied by block_dirty_folio;
1854	 * handle that here by just cleaning them.
1855	 */
1856
1857	bh = head;
1858	blocksize = bh->b_size;
1859
1860	block = div_u64(folio_pos(folio), blocksize);
1861	last_block = div_u64(i_size_read(inode) - 1, blocksize);
1862
1863	/*
1864	 * Get all the dirty buffers mapped to disk addresses and
1865	 * handle any aliases from the underlying blockdev's mapping.
1866	 */
1867	do {
1868		if (block > last_block) {
1869			/*
1870			 * mapped buffers outside i_size will occur, because
1871			 * this folio can be outside i_size when there is a
1872			 * truncate in progress.
1873			 */
1874			/*
1875			 * The buffer was zeroed by block_write_full_folio()
1876			 */
1877			clear_buffer_dirty(bh);
1878			set_buffer_uptodate(bh);
1879		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1880			   buffer_dirty(bh)) {
1881			WARN_ON(bh->b_size != blocksize);
1882			err = get_block(inode, block, bh, 1);
1883			if (err)
1884				goto recover;
1885			clear_buffer_delay(bh);
1886			if (buffer_new(bh)) {
1887				/* blockdev mappings never come here */
1888				clear_buffer_new(bh);
1889				clean_bdev_bh_alias(bh);
1890			}
1891		}
1892		bh = bh->b_this_page;
1893		block++;
1894	} while (bh != head);
1895
1896	do {
1897		if (!buffer_mapped(bh))
1898			continue;
1899		/*
1900		 * If it's a fully non-blocking write attempt and we cannot
1901		 * lock the buffer then redirty the folio.  Note that this can
1902		 * potentially cause a busy-wait loop from writeback threads
1903		 * and kswapd activity, but those code paths have their own
1904		 * higher-level throttling.
1905		 */
1906		if (wbc->sync_mode != WB_SYNC_NONE) {
1907			lock_buffer(bh);
1908		} else if (!trylock_buffer(bh)) {
1909			folio_redirty_for_writepage(wbc, folio);
1910			continue;
1911		}
1912		if (test_clear_buffer_dirty(bh)) {
1913			mark_buffer_async_write_endio(bh,
1914				end_buffer_async_write);
1915		} else {
1916			unlock_buffer(bh);
1917		}
1918	} while ((bh = bh->b_this_page) != head);
1919
1920	/*
1921	 * The folio and its buffers are protected by the writeback flag,
1922	 * so we can drop the bh refcounts early.
1923	 */
1924	BUG_ON(folio_test_writeback(folio));
1925	folio_start_writeback(folio);
1926
1927	do {
1928		struct buffer_head *next = bh->b_this_page;
1929		if (buffer_async_write(bh)) {
1930			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
1931				      inode->i_write_hint, wbc);
1932			nr_underway++;
1933		}
1934		bh = next;
1935	} while (bh != head);
1936	folio_unlock(folio);
1937
1938	err = 0;
1939done:
1940	if (nr_underway == 0) {
1941		/*
1942		 * The folio was marked dirty, but the buffers were
1943		 * clean.  Someone wrote them back by hand with
1944		 * write_dirty_buffer/submit_bh.  A rare case.
1945		 */
1946		folio_end_writeback(folio);
1947
1948		/*
1949		 * The folio and buffer_heads can be released at any time from
1950		 * here on.
1951		 */
1952	}
1953	return err;
1954
1955recover:
1956	/*
1957	 * ENOSPC, or some other error.  We may already have added some
1958	 * blocks to the file, so we need to write these out to avoid
1959	 * exposing stale data.
1960	 * The folio is currently locked and not marked for writeback
1961	 */
1962	bh = head;
1963	/* Recovery: lock and submit the mapped buffers */
1964	do {
1965		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1966		    !buffer_delay(bh)) {
1967			lock_buffer(bh);
1968			mark_buffer_async_write_endio(bh,
1969				end_buffer_async_write);
1970		} else {
1971			/*
1972			 * The buffer may have been set dirty during
1973			 * attachment to a dirty folio.
1974			 */
1975			clear_buffer_dirty(bh);
1976		}
1977	} while ((bh = bh->b_this_page) != head);
1978	BUG_ON(folio_test_writeback(folio));
1979	mapping_set_error(folio->mapping, err);
1980	folio_start_writeback(folio);
1981	do {
1982		struct buffer_head *next = bh->b_this_page;
1983		if (buffer_async_write(bh)) {
1984			clear_buffer_dirty(bh);
1985			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
1986				      inode->i_write_hint, wbc);
1987			nr_underway++;
1988		}
1989		bh = next;
1990	} while (bh != head);
1991	folio_unlock(folio);
1992	goto done;
1993}
1994EXPORT_SYMBOL(__block_write_full_folio);
1995
1996/*
1997 * If a folio has any new buffers, zero them out here, and mark them uptodate
1998 * and dirty so they'll be written out (in order to prevent uninitialised
1999 * block data from leaking). And clear the new bit.
2000 */
2001void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
2002{
2003	size_t block_start, block_end;
2004	struct buffer_head *head, *bh;
2005
2006	BUG_ON(!folio_test_locked(folio));
2007	head = folio_buffers(folio);
2008	if (!head)
2009		return;
2010
2011	bh = head;
2012	block_start = 0;
2013	do {
2014		block_end = block_start + bh->b_size;
2015
2016		if (buffer_new(bh)) {
2017			if (block_end > from && block_start < to) {
2018				if (!folio_test_uptodate(folio)) {
2019					size_t start, xend;
2020
2021					start = max(from, block_start);
2022					xend = min(to, block_end);
2023
2024					folio_zero_segment(folio, start, xend);
2025					set_buffer_uptodate(bh);
2026				}
2027
2028				clear_buffer_new(bh);
2029				mark_buffer_dirty(bh);
2030			}
2031		}
2032
2033		block_start = block_end;
2034		bh = bh->b_this_page;
2035	} while (bh != head);
2036}
2037EXPORT_SYMBOL(folio_zero_new_buffers);
2038
2039static int
2040iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
2041		const struct iomap *iomap)
2042{
2043	loff_t offset = (loff_t)block << inode->i_blkbits;
2044
2045	bh->b_bdev = iomap->bdev;
2046
2047	/*
2048	 * Block points to offset in file we need to map, iomap contains
2049	 * the offset at which the map starts. If the map ends before the
2050	 * current block, then do not map the buffer and let the caller
2051	 * handle it.
2052	 */
2053	if (offset >= iomap->offset + iomap->length)
2054		return -EIO;
2055
2056	switch (iomap->type) {
2057	case IOMAP_HOLE:
2058		/*
2059		 * If the buffer is not up to date or beyond the current EOF,
2060		 * we need to mark it as new to ensure sub-block zeroing is
2061		 * executed if necessary.
2062		 */
2063		if (!buffer_uptodate(bh) ||
2064		    (offset >= i_size_read(inode)))
2065			set_buffer_new(bh);
2066		return 0;
2067	case IOMAP_DELALLOC:
2068		if (!buffer_uptodate(bh) ||
2069		    (offset >= i_size_read(inode)))
2070			set_buffer_new(bh);
2071		set_buffer_uptodate(bh);
2072		set_buffer_mapped(bh);
2073		set_buffer_delay(bh);
2074		return 0;
2075	case IOMAP_UNWRITTEN:
2076		/*
2077		 * For unwritten regions, we always need to ensure that regions
2078		 * in the block we are not writing to are zeroed. Mark the
2079		 * buffer as new to ensure this.
2080		 */
2081		set_buffer_new(bh);
2082		set_buffer_unwritten(bh);
2083		fallthrough;
2084	case IOMAP_MAPPED:
2085		if ((iomap->flags & IOMAP_F_NEW) ||
2086		    offset >= i_size_read(inode)) {
2087			/*
2088			 * This can happen if truncating the block device races
2089			 * with the check in the caller as i_size updates on
2090			 * block devices aren't synchronized by i_rwsem for
2091			 * block devices.
2092			 */
2093			if (S_ISBLK(inode->i_mode))
2094				return -EIO;
2095			set_buffer_new(bh);
2096		}
2097		bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2098				inode->i_blkbits;
2099		set_buffer_mapped(bh);
2100		return 0;
2101	default:
2102		WARN_ON_ONCE(1);
2103		return -EIO;
2104	}
2105}
2106
2107int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
2108		get_block_t *get_block, const struct iomap *iomap)
2109{
2110	size_t from = offset_in_folio(folio, pos);
2111	size_t to = from + len;
2112	struct inode *inode = folio->mapping->host;
2113	size_t block_start, block_end;
2114	sector_t block;
2115	int err = 0;
2116	size_t blocksize;
2117	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
2118
2119	BUG_ON(!folio_test_locked(folio));
2120	BUG_ON(to > folio_size(folio));
2121	BUG_ON(from > to);
2122
2123	head = folio_create_buffers(folio, inode, 0);
2124	blocksize = head->b_size;
2125	block = div_u64(folio_pos(folio), blocksize);
2126
2127	for (bh = head, block_start = 0; bh != head || !block_start;
2128	    block++, block_start=block_end, bh = bh->b_this_page) {
2129		block_end = block_start + blocksize;
2130		if (block_end <= from || block_start >= to) {
2131			if (folio_test_uptodate(folio)) {
2132				if (!buffer_uptodate(bh))
2133					set_buffer_uptodate(bh);
2134			}
2135			continue;
2136		}
2137		if (buffer_new(bh))
2138			clear_buffer_new(bh);
2139		if (!buffer_mapped(bh)) {
2140			WARN_ON(bh->b_size != blocksize);
2141			if (get_block)
2142				err = get_block(inode, block, bh, 1);
2143			else
2144				err = iomap_to_bh(inode, block, bh, iomap);
2145			if (err)
2146				break;
2147
2148			if (buffer_new(bh)) {
2149				clean_bdev_bh_alias(bh);
2150				if (folio_test_uptodate(folio)) {
2151					clear_buffer_new(bh);
2152					set_buffer_uptodate(bh);
2153					mark_buffer_dirty(bh);
2154					continue;
2155				}
2156				if (block_end > to || block_start < from)
2157					folio_zero_segments(folio,
2158						to, block_end,
2159						block_start, from);
2160				continue;
2161			}
2162		}
2163		if (folio_test_uptodate(folio)) {
2164			if (!buffer_uptodate(bh))
2165				set_buffer_uptodate(bh);
2166			continue; 
2167		}
2168		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2169		    !buffer_unwritten(bh) &&
2170		     (block_start < from || block_end > to)) {
2171			bh_read_nowait(bh, 0);
2172			*wait_bh++=bh;
2173		}
2174	}
2175	/*
2176	 * If we issued read requests - let them complete.
2177	 */
2178	while(wait_bh > wait) {
2179		wait_on_buffer(*--wait_bh);
2180		if (!buffer_uptodate(*wait_bh))
2181			err = -EIO;
2182	}
2183	if (unlikely(err))
2184		folio_zero_new_buffers(folio, from, to);
2185	return err;
2186}
2187
2188int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
2189		get_block_t *get_block)
2190{
2191	return __block_write_begin_int(folio, pos, len, get_block, NULL);
2192}
2193EXPORT_SYMBOL(__block_write_begin);
2194
2195void block_commit_write(struct folio *folio, size_t from, size_t to)
2196{
2197	size_t block_start, block_end;
2198	bool partial = false;
2199	unsigned blocksize;
2200	struct buffer_head *bh, *head;
2201
2202	bh = head = folio_buffers(folio);
2203	if (!bh)
2204		return;
2205	blocksize = bh->b_size;
2206
2207	block_start = 0;
2208	do {
2209		block_end = block_start + blocksize;
2210		if (block_end <= from || block_start >= to) {
2211			if (!buffer_uptodate(bh))
2212				partial = true;
2213		} else {
2214			set_buffer_uptodate(bh);
2215			mark_buffer_dirty(bh);
2216		}
2217		if (buffer_new(bh))
2218			clear_buffer_new(bh);
2219
2220		block_start = block_end;
2221		bh = bh->b_this_page;
2222	} while (bh != head);
2223
2224	/*
2225	 * If this is a partial write which happened to make all buffers
2226	 * uptodate then we can optimize away a bogus read_folio() for
2227	 * the next read(). Here we 'discover' whether the folio went
2228	 * uptodate as a result of this (potentially partial) write.
2229	 */
2230	if (!partial)
2231		folio_mark_uptodate(folio);
2232}
2233EXPORT_SYMBOL(block_commit_write);
2234
2235/*
2236 * block_write_begin takes care of the basic task of block allocation and
2237 * bringing partial write blocks uptodate first.
2238 *
2239 * The filesystem needs to handle block truncation upon failure.
2240 */
2241int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2242		struct folio **foliop, get_block_t *get_block)
2243{
2244	pgoff_t index = pos >> PAGE_SHIFT;
2245	struct folio *folio;
2246	int status;
2247
2248	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2249			mapping_gfp_mask(mapping));
2250	if (IS_ERR(folio))
2251		return PTR_ERR(folio);
2252
2253	status = __block_write_begin_int(folio, pos, len, get_block, NULL);
2254	if (unlikely(status)) {
2255		folio_unlock(folio);
2256		folio_put(folio);
2257		folio = NULL;
2258	}
2259
2260	*foliop = folio;
2261	return status;
2262}
2263EXPORT_SYMBOL(block_write_begin);
2264
2265int block_write_end(loff_t pos, unsigned len, unsigned copied,
2266		struct folio *folio)
2267{
2268	size_t start = pos - folio_pos(folio);
2269
2270	if (unlikely(copied < len)) {
2271		/*
2272		 * The buffers that were written will now be uptodate, so
2273		 * we don't have to worry about a read_folio reading them
2274		 * and overwriting a partial write. However if we have
2275		 * encountered a short write and only partially written
2276		 * into a buffer, it will not be marked uptodate, so a
2277		 * read_folio might come in and destroy our partial write.
2278		 *
2279		 * Do the simplest thing, and just treat any short write to a
2280		 * non uptodate folio as a zero-length write, and force the
2281		 * caller to redo the whole thing.
2282		 */
2283		if (!folio_test_uptodate(folio))
2284			copied = 0;
2285
2286		folio_zero_new_buffers(folio, start+copied, start+len);
2287	}
2288	flush_dcache_folio(folio);
2289
2290	/* This could be a short (even 0-length) commit */
2291	block_commit_write(folio, start, start + copied);
2292
2293	return copied;
2294}
2295EXPORT_SYMBOL(block_write_end);
2296
2297int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
2298		      loff_t pos, unsigned len, unsigned copied,
2299		      struct folio *folio, void *fsdata)
2300{
2301	struct inode *inode = mapping->host;
2302	loff_t old_size = inode->i_size;
2303	bool i_size_changed = false;
2304
2305	copied = block_write_end(pos, len, copied, folio);
2306
2307	/*
2308	 * No need to use i_size_read() here, the i_size cannot change under us
2309	 * because we hold i_rwsem.
2310	 *
2311	 * But it's important to update i_size while still holding folio lock:
2312	 * page writeout could otherwise come in and zero beyond i_size.
2313	 */
2314	if (pos + copied > inode->i_size) {
2315		i_size_write(inode, pos + copied);
2316		i_size_changed = true;
2317	}
2318
2319	folio_unlock(folio);
2320	folio_put(folio);
2321
2322	if (old_size < pos)
2323		pagecache_isize_extended(inode, old_size, pos);
2324	/*
2325	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2326	 * makes the holding time of page lock longer. Second, it forces lock
2327	 * ordering of page lock and transaction start for journaling
2328	 * filesystems.
2329	 */
2330	if (i_size_changed)
2331		mark_inode_dirty(inode);
2332	return copied;
2333}
2334EXPORT_SYMBOL(generic_write_end);
2335
2336/*
2337 * block_is_partially_uptodate checks whether buffers within a folio are
2338 * uptodate or not.
2339 *
2340 * Returns true if all buffers which correspond to the specified part
2341 * of the folio are uptodate.
2342 */
2343bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2344{
2345	unsigned block_start, block_end, blocksize;
2346	unsigned to;
2347	struct buffer_head *bh, *head;
2348	bool ret = true;
2349
2350	head = folio_buffers(folio);
2351	if (!head)
2352		return false;
2353	blocksize = head->b_size;
2354	to = min(folio_size(folio) - from, count);
2355	to = from + to;
2356	if (from < blocksize && to > folio_size(folio) - blocksize)
2357		return false;
2358
2359	bh = head;
2360	block_start = 0;
2361	do {
2362		block_end = block_start + blocksize;
2363		if (block_end > from && block_start < to) {
2364			if (!buffer_uptodate(bh)) {
2365				ret = false;
2366				break;
2367			}
2368			if (block_end >= to)
2369				break;
2370		}
2371		block_start = block_end;
2372		bh = bh->b_this_page;
2373	} while (bh != head);
2374
2375	return ret;
2376}
2377EXPORT_SYMBOL(block_is_partially_uptodate);
2378
2379/*
2380 * Generic "read_folio" function for block devices that have the normal
2381 * get_block functionality. This is most of the block device filesystems.
2382 * Reads the folio asynchronously --- the unlock_buffer() and
2383 * set/clear_buffer_uptodate() functions propagate buffer state into the
2384 * folio once IO has completed.
2385 */
2386int block_read_full_folio(struct folio *folio, get_block_t *get_block)
2387{
2388	struct inode *inode = folio->mapping->host;
2389	sector_t iblock, lblock;
2390	struct buffer_head *bh, *head, *prev = NULL;
2391	size_t blocksize;
2392	int fully_mapped = 1;
2393	bool page_error = false;
2394	loff_t limit = i_size_read(inode);
2395
2396	/* This is needed for ext4. */
2397	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2398		limit = inode->i_sb->s_maxbytes;
2399
2400	head = folio_create_buffers(folio, inode, 0);
2401	blocksize = head->b_size;
2402
2403	iblock = div_u64(folio_pos(folio), blocksize);
2404	lblock = div_u64(limit + blocksize - 1, blocksize);
2405	bh = head;
2406
2407	do {
2408		if (buffer_uptodate(bh))
2409			continue;
2410
2411		if (!buffer_mapped(bh)) {
2412			int err = 0;
2413
2414			fully_mapped = 0;
2415			if (iblock < lblock) {
2416				WARN_ON(bh->b_size != blocksize);
2417				err = get_block(inode, iblock, bh, 0);
2418				if (err)
2419					page_error = true;
2420			}
2421			if (!buffer_mapped(bh)) {
2422				folio_zero_range(folio, bh_offset(bh),
2423						blocksize);
2424				if (!err)
2425					set_buffer_uptodate(bh);
2426				continue;
2427			}
2428			/*
2429			 * get_block() might have updated the buffer
2430			 * synchronously
2431			 */
2432			if (buffer_uptodate(bh))
2433				continue;
2434		}
2435
2436		lock_buffer(bh);
2437		if (buffer_uptodate(bh)) {
2438			unlock_buffer(bh);
2439			continue;
2440		}
2441
2442		mark_buffer_async_read(bh);
2443		if (prev)
2444			submit_bh(REQ_OP_READ, prev);
2445		prev = bh;
2446	} while (iblock++, (bh = bh->b_this_page) != head);
2447
2448	if (fully_mapped)
2449		folio_set_mappedtodisk(folio);
2450
2451	/*
2452	 * All buffers are uptodate or get_block() returned an error
2453	 * when trying to map them - we must finish the read because
2454	 * end_buffer_async_read() will never be called on any buffer
2455	 * in this folio.
2456	 */
2457	if (prev)
2458		submit_bh(REQ_OP_READ, prev);
2459	else
2460		folio_end_read(folio, !page_error);
2461
2462	return 0;
2463}
2464EXPORT_SYMBOL(block_read_full_folio);
2465
2466/* utility function for filesystems that need to do work on expanding
2467 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2468 * deal with the hole.  
2469 */
2470int generic_cont_expand_simple(struct inode *inode, loff_t size)
2471{
2472	struct address_space *mapping = inode->i_mapping;
2473	const struct address_space_operations *aops = mapping->a_ops;
2474	struct folio *folio;
2475	void *fsdata = NULL;
2476	int err;
2477
2478	err = inode_newsize_ok(inode, size);
2479	if (err)
2480		goto out;
2481
2482	err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
2483	if (err)
2484		goto out;
2485
2486	err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
2487	BUG_ON(err > 0);
2488
2489out:
2490	return err;
2491}
2492EXPORT_SYMBOL(generic_cont_expand_simple);
2493
2494static int cont_expand_zero(const struct kiocb *iocb,
2495			    struct address_space *mapping,
2496			    loff_t pos, loff_t *bytes)
2497{
2498	struct inode *inode = mapping->host;
2499	const struct address_space_operations *aops = mapping->a_ops;
2500	unsigned int blocksize = i_blocksize(inode);
2501	struct folio *folio;
2502	void *fsdata = NULL;
2503	pgoff_t index, curidx;
2504	loff_t curpos;
2505	unsigned zerofrom, offset, len;
2506	int err = 0;
2507
2508	index = pos >> PAGE_SHIFT;
2509	offset = pos & ~PAGE_MASK;
2510
2511	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2512		zerofrom = curpos & ~PAGE_MASK;
2513		if (zerofrom & (blocksize-1)) {
2514			*bytes |= (blocksize-1);
2515			(*bytes)++;
2516		}
2517		len = PAGE_SIZE - zerofrom;
2518
2519		err = aops->write_begin(iocb, mapping, curpos, len,
2520					    &folio, &fsdata);
2521		if (err)
2522			goto out;
2523		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
2524		err = aops->write_end(iocb, mapping, curpos, len, len,
2525						folio, fsdata);
2526		if (err < 0)
2527			goto out;
2528		BUG_ON(err != len);
2529		err = 0;
2530
2531		balance_dirty_pages_ratelimited(mapping);
2532
2533		if (fatal_signal_pending(current)) {
2534			err = -EINTR;
2535			goto out;
2536		}
2537	}
2538
2539	/* page covers the boundary, find the boundary offset */
2540	if (index == curidx) {
2541		zerofrom = curpos & ~PAGE_MASK;
2542		/* if we will expand the thing last block will be filled */
2543		if (offset <= zerofrom) {
2544			goto out;
2545		}
2546		if (zerofrom & (blocksize-1)) {
2547			*bytes |= (blocksize-1);
2548			(*bytes)++;
2549		}
2550		len = offset - zerofrom;
2551
2552		err = aops->write_begin(iocb, mapping, curpos, len,
2553					    &folio, &fsdata);
2554		if (err)
2555			goto out;
2556		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
2557		err = aops->write_end(iocb, mapping, curpos, len, len,
2558						folio, fsdata);
2559		if (err < 0)
2560			goto out;
2561		BUG_ON(err != len);
2562		err = 0;
2563	}
2564out:
2565	return err;
2566}
2567
2568/*
2569 * For moronic filesystems that do not allow holes in file.
2570 * We may have to extend the file.
2571 */
2572int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
2573		     loff_t pos, unsigned len, struct folio **foliop,
2574		     void **fsdata, get_block_t *get_block, loff_t *bytes)
2575{
2576	struct inode *inode = mapping->host;
2577	unsigned int blocksize = i_blocksize(inode);
2578	unsigned int zerofrom;
2579	int err;
2580
2581	err = cont_expand_zero(iocb, mapping, pos, bytes);
2582	if (err)
2583		return err;
2584
2585	zerofrom = *bytes & ~PAGE_MASK;
2586	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2587		*bytes |= (blocksize-1);
2588		(*bytes)++;
2589	}
2590
2591	return block_write_begin(mapping, pos, len, foliop, get_block);
2592}
2593EXPORT_SYMBOL(cont_write_begin);
2594
2595/*
2596 * block_page_mkwrite() is not allowed to change the file size as it gets
2597 * called from a page fault handler when a page is first dirtied. Hence we must
2598 * be careful to check for EOF conditions here. We set the page up correctly
2599 * for a written page which means we get ENOSPC checking when writing into
2600 * holes and correct delalloc and unwritten extent mapping on filesystems that
2601 * support these features.
2602 *
2603 * We are not allowed to take the i_rwsem here so we have to play games to
2604 * protect against truncate races as the page could now be beyond EOF.  Because
2605 * truncate writes the inode size before removing pages, once we have the
2606 * page lock we can determine safely if the page is beyond EOF. If it is not
2607 * beyond EOF, then the page is guaranteed safe against truncation until we
2608 * unlock the page.
2609 *
2610 * Direct callers of this function should protect against filesystem freezing
2611 * using sb_start_pagefault() - sb_end_pagefault() functions.
2612 */
2613int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2614			 get_block_t get_block)
2615{
2616	struct folio *folio = page_folio(vmf->page);
2617	struct inode *inode = file_inode(vma->vm_file);
2618	unsigned long end;
2619	loff_t size;
2620	int ret;
2621
2622	folio_lock(folio);
2623	size = i_size_read(inode);
2624	if ((folio->mapping != inode->i_mapping) ||
2625	    (folio_pos(folio) >= size)) {
2626		/* We overload EFAULT to mean page got truncated */
2627		ret = -EFAULT;
2628		goto out_unlock;
2629	}
2630
2631	end = folio_size(folio);
2632	/* folio is wholly or partially inside EOF */
2633	if (folio_pos(folio) + end > size)
2634		end = size - folio_pos(folio);
2635
2636	ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
2637	if (unlikely(ret))
2638		goto out_unlock;
2639
2640	block_commit_write(folio, 0, end);
2641
2642	folio_mark_dirty(folio);
2643	folio_wait_stable(folio);
2644	return 0;
2645out_unlock:
2646	folio_unlock(folio);
2647	return ret;
2648}
2649EXPORT_SYMBOL(block_page_mkwrite);
2650
2651int block_truncate_page(struct address_space *mapping,
2652			loff_t from, get_block_t *get_block)
2653{
2654	pgoff_t index = from >> PAGE_SHIFT;
2655	unsigned blocksize;
2656	sector_t iblock;
2657	size_t offset, length, pos;
2658	struct inode *inode = mapping->host;
2659	struct folio *folio;
2660	struct buffer_head *bh;
2661	int err = 0;
2662
2663	blocksize = i_blocksize(inode);
2664	length = from & (blocksize - 1);
2665
2666	/* Block boundary? Nothing to do */
2667	if (!length)
2668		return 0;
2669
2670	length = blocksize - length;
2671	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
2672
2673	folio = filemap_grab_folio(mapping, index);
2674	if (IS_ERR(folio))
2675		return PTR_ERR(folio);
2676
2677	bh = folio_buffers(folio);
2678	if (!bh)
2679		bh = create_empty_buffers(folio, blocksize, 0);
2680
2681	/* Find the buffer that contains "offset" */
2682	offset = offset_in_folio(folio, from);
2683	pos = blocksize;
2684	while (offset >= pos) {
2685		bh = bh->b_this_page;
2686		iblock++;
2687		pos += blocksize;
2688	}
2689
2690	if (!buffer_mapped(bh)) {
2691		WARN_ON(bh->b_size != blocksize);
2692		err = get_block(inode, iblock, bh, 0);
2693		if (err)
2694			goto unlock;
2695		/* unmapped? It's a hole - nothing to do */
2696		if (!buffer_mapped(bh))
2697			goto unlock;
2698	}
2699
2700	/* Ok, it's mapped. Make sure it's up-to-date */
2701	if (folio_test_uptodate(folio))
2702		set_buffer_uptodate(bh);
2703
2704	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2705		err = bh_read(bh, 0);
2706		/* Uhhuh. Read error. Complain and punt. */
2707		if (err < 0)
2708			goto unlock;
2709	}
2710
2711	folio_zero_range(folio, offset, length);
2712	mark_buffer_dirty(bh);
2713
2714unlock:
2715	folio_unlock(folio);
2716	folio_put(folio);
2717
2718	return err;
2719}
2720EXPORT_SYMBOL(block_truncate_page);
2721
2722/*
2723 * The generic write folio function for buffer-backed address_spaces
2724 */
2725int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
2726		void *get_block)
2727{
2728	struct inode * const inode = folio->mapping->host;
2729	loff_t i_size = i_size_read(inode);
2730
2731	/* Is the folio fully inside i_size? */
2732	if (folio_next_pos(folio) <= i_size)
2733		return __block_write_full_folio(inode, folio, get_block, wbc);
2734
2735	/* Is the folio fully outside i_size? (truncate in progress) */
2736	if (folio_pos(folio) >= i_size) {
2737		folio_unlock(folio);
2738		return 0; /* don't care */
2739	}
2740
2741	/*
2742	 * The folio straddles i_size.  It must be zeroed out on each and every
2743	 * writeback invocation because it may be mmapped.  "A file is mapped
2744	 * in multiples of the page size.  For a file that is not a multiple of
2745	 * the page size, the remaining memory is zeroed when mapped, and
2746	 * writes to that region are not written out to the file."
2747	 */
2748	folio_zero_segment(folio, offset_in_folio(folio, i_size),
2749			folio_size(folio));
2750	return __block_write_full_folio(inode, folio, get_block, wbc);
2751}
2752
2753sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2754			    get_block_t *get_block)
2755{
2756	struct inode *inode = mapping->host;
2757	struct buffer_head tmp = {
2758		.b_size = i_blocksize(inode),
2759	};
2760
2761	get_block(inode, block, &tmp, 0);
2762	return tmp.b_blocknr;
2763}
2764EXPORT_SYMBOL(generic_block_bmap);
2765
2766static void end_bio_bh_io_sync(struct bio *bio)
2767{
2768	struct buffer_head *bh = bio->bi_private;
2769
2770	if (unlikely(bio_flagged(bio, BIO_QUIET)))
2771		set_bit(BH_Quiet, &bh->b_state);
2772
2773	bh->b_end_io(bh, !bio->bi_status);
2774	bio_put(bio);
2775}
2776
2777static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2778			  enum rw_hint write_hint,
2779			  struct writeback_control *wbc)
2780{
2781	const enum req_op op = opf & REQ_OP_MASK;
2782	struct bio *bio;
2783
2784	BUG_ON(!buffer_locked(bh));
2785	BUG_ON(!buffer_mapped(bh));
2786	BUG_ON(!bh->b_end_io);
2787	BUG_ON(buffer_delay(bh));
2788	BUG_ON(buffer_unwritten(bh));
2789
2790	/*
2791	 * Only clear out a write error when rewriting
2792	 */
2793	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2794		clear_buffer_write_io_error(bh);
2795
2796	if (buffer_meta(bh))
2797		opf |= REQ_META;
2798	if (buffer_prio(bh))
2799		opf |= REQ_PRIO;
2800
2801	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
2802
2803	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
2804
2805	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2806	bio->bi_write_hint = write_hint;
2807
2808	bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
2809
2810	bio->bi_end_io = end_bio_bh_io_sync;
2811	bio->bi_private = bh;
2812
2813	/* Take care of bh's that straddle the end of the device */
2814	guard_bio_eod(bio);
2815
2816	if (wbc) {
2817		wbc_init_bio(wbc, bio);
2818		wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
2819	}
2820
2821	blk_crypto_submit_bio(bio);
2822}
2823
2824void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2825{
2826	submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
2827}
2828EXPORT_SYMBOL(submit_bh);
2829
2830void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2831{
2832	lock_buffer(bh);
2833	if (!test_clear_buffer_dirty(bh)) {
2834		unlock_buffer(bh);
2835		return;
2836	}
2837	bh->b_end_io = end_buffer_write_sync;
2838	get_bh(bh);
2839	submit_bh(REQ_OP_WRITE | op_flags, bh);
2840}
2841EXPORT_SYMBOL(write_dirty_buffer);
2842
2843/*
2844 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2845 * and then start new I/O and then wait upon it.  The caller must have a ref on
2846 * the buffer_head.
2847 */
2848int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2849{
2850	WARN_ON(atomic_read(&bh->b_count) < 1);
2851	lock_buffer(bh);
2852	if (test_clear_buffer_dirty(bh)) {
2853		/*
2854		 * The bh should be mapped, but it might not be if the
2855		 * device was hot-removed. Not much we can do but fail the I/O.
2856		 */
2857		if (!buffer_mapped(bh)) {
2858			unlock_buffer(bh);
2859			return -EIO;
2860		}
2861
2862		get_bh(bh);
2863		bh->b_end_io = end_buffer_write_sync;
2864		submit_bh(REQ_OP_WRITE | op_flags, bh);
2865		wait_on_buffer(bh);
2866		if (!buffer_uptodate(bh))
2867			return -EIO;
2868	} else {
2869		unlock_buffer(bh);
2870	}
2871	return 0;
2872}
2873EXPORT_SYMBOL(__sync_dirty_buffer);
2874
2875int sync_dirty_buffer(struct buffer_head *bh)
2876{
2877	return __sync_dirty_buffer(bh, REQ_SYNC);
2878}
2879EXPORT_SYMBOL(sync_dirty_buffer);
2880
2881static inline int buffer_busy(struct buffer_head *bh)
2882{
2883	return atomic_read(&bh->b_count) |
2884		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2885}
2886
2887static bool
2888drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2889{
2890	struct buffer_head *head = folio_buffers(folio);
2891	struct buffer_head *bh;
2892
2893	bh = head;
2894	do {
2895		if (buffer_busy(bh))
2896			goto failed;
2897		bh = bh->b_this_page;
2898	} while (bh != head);
2899
2900	do {
2901		struct buffer_head *next = bh->b_this_page;
2902
2903		if (bh->b_assoc_map)
2904			__remove_assoc_queue(bh);
2905		bh = next;
2906	} while (bh != head);
2907	*buffers_to_free = head;
2908	folio_detach_private(folio);
2909	return true;
2910failed:
2911	return false;
2912}
2913
2914/**
2915 * try_to_free_buffers - Release buffers attached to this folio.
2916 * @folio: The folio.
2917 *
2918 * If any buffers are in use (dirty, under writeback, elevated refcount),
2919 * no buffers will be freed.
2920 *
2921 * If the folio is dirty but all the buffers are clean then we need to
2922 * be sure to mark the folio clean as well.  This is because the folio
2923 * may be against a block device, and a later reattachment of buffers
2924 * to a dirty folio will set *all* buffers dirty.  Which would corrupt
2925 * filesystem data on the same device.
2926 *
2927 * The same applies to regular filesystem folios: if all the buffers are
2928 * clean then we set the folio clean and proceed.  To do that, we require
2929 * total exclusion from block_dirty_folio().  That is obtained with
2930 * i_private_lock.
2931 *
2932 * Exclusion against try_to_free_buffers may be obtained by either
2933 * locking the folio or by holding its mapping's i_private_lock.
2934 *
2935 * Context: Process context.  @folio must be locked.  Will not sleep.
2936 * Return: true if all buffers attached to this folio were freed.
2937 */
2938bool try_to_free_buffers(struct folio *folio)
2939{
2940	struct address_space * const mapping = folio->mapping;
2941	struct buffer_head *buffers_to_free = NULL;
2942	bool ret = 0;
2943
2944	BUG_ON(!folio_test_locked(folio));
2945	if (folio_test_writeback(folio))
2946		return false;
2947
2948	/* Misconfigured folio check */
2949	if (WARN_ON_ONCE(!folio_buffers(folio)))
2950		return true;
2951
2952	if (mapping == NULL) {		/* can this still happen? */
2953		ret = drop_buffers(folio, &buffers_to_free);
2954		goto out;
2955	}
2956
2957	spin_lock(&mapping->i_private_lock);
2958	ret = drop_buffers(folio, &buffers_to_free);
2959
2960	/*
2961	 * If the filesystem writes its buffers by hand (eg ext3)
2962	 * then we can have clean buffers against a dirty folio.  We
2963	 * clean the folio here; otherwise the VM will never notice
2964	 * that the filesystem did any IO at all.
2965	 *
2966	 * Also, during truncate, discard_buffer will have marked all
2967	 * the folio's buffers clean.  We discover that here and clean
2968	 * the folio also.
2969	 *
2970	 * i_private_lock must be held over this entire operation in order
2971	 * to synchronise against block_dirty_folio and prevent the
2972	 * dirty bit from being lost.
2973	 */
2974	if (ret)
2975		folio_cancel_dirty(folio);
2976	spin_unlock(&mapping->i_private_lock);
2977out:
2978	if (buffers_to_free) {
2979		struct buffer_head *bh = buffers_to_free;
2980
2981		do {
2982			struct buffer_head *next = bh->b_this_page;
2983			free_buffer_head(bh);
2984			bh = next;
2985		} while (bh != buffers_to_free);
2986	}
2987	return ret;
2988}
2989EXPORT_SYMBOL(try_to_free_buffers);
2990
2991/*
2992 * Buffer-head allocation
2993 */
2994static struct kmem_cache *bh_cachep __ro_after_init;
2995
2996/*
2997 * Once the number of bh's in the machine exceeds this level, we start
2998 * stripping them in writeback.
2999 */
3000static unsigned long max_buffer_heads __ro_after_init;
3001
3002int buffer_heads_over_limit;
3003
3004struct bh_accounting {
3005	int nr;			/* Number of live bh's */
3006	int ratelimit;		/* Limit cacheline bouncing */
3007};
3008
3009static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3010
3011static void recalc_bh_state(void)
3012{
3013	int i;
3014	int tot = 0;
3015
3016	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3017		return;
3018	__this_cpu_write(bh_accounting.ratelimit, 0);
3019	for_each_online_cpu(i)
3020		tot += per_cpu(bh_accounting, i).nr;
3021	buffer_heads_over_limit = (tot > max_buffer_heads);
3022}
3023
3024struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3025{
3026	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3027	if (ret) {
3028		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3029		spin_lock_init(&ret->b_uptodate_lock);
3030		preempt_disable();
3031		__this_cpu_inc(bh_accounting.nr);
3032		recalc_bh_state();
3033		preempt_enable();
3034	}
3035	return ret;
3036}
3037EXPORT_SYMBOL(alloc_buffer_head);
3038
3039void free_buffer_head(struct buffer_head *bh)
3040{
3041	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3042	kmem_cache_free(bh_cachep, bh);
3043	preempt_disable();
3044	__this_cpu_dec(bh_accounting.nr);
3045	recalc_bh_state();
3046	preempt_enable();
3047}
3048EXPORT_SYMBOL(free_buffer_head);
3049
3050static int buffer_exit_cpu_dead(unsigned int cpu)
3051{
3052	int i;
3053	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3054
3055	for (i = 0; i < BH_LRU_SIZE; i++) {
3056		brelse(b->bhs[i]);
3057		b->bhs[i] = NULL;
3058	}
3059	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3060	per_cpu(bh_accounting, cpu).nr = 0;
3061	return 0;
3062}
3063
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return: 1 if the buffer is up-to-date (and left unlocked); 0, with the
 * buffer locked, if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Check again now that the lock is held. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3082
3083/**
3084 * __bh_read - Submit read for a locked buffer
3085 * @bh: struct buffer_head
3086 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3087 * @wait: wait until reading finish
3088 *
3089 * Returns zero on success or don't wait, and -EIO on error.
3090 */
3091int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3092{
3093	int ret = 0;
3094
3095	BUG_ON(!buffer_locked(bh));
3096
3097	get_bh(bh);
3098	bh->b_end_io = end_buffer_read_sync;
3099	submit_bh(REQ_OP_READ | op_flags, bh);
3100	if (wait) {
3101		wait_on_buffer(bh);
3102		if (!buffer_uptodate(bh))
3103			ret = -EIO;
3104	}
3105	return ret;
3106}
3107EXPORT_SYMBOL(__bh_read);
3108
3109/**
3110 * __bh_read_batch - Submit read for a batch of unlocked buffers
3111 * @nr: entry number of the buffer batch
3112 * @bhs: a batch of struct buffer_head
3113 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3114 * @force_lock: force to get a lock on the buffer if set, otherwise drops any
3115 *              buffer that cannot lock.
3116 *
3117 * Returns zero on success or don't wait, and -EIO on error.
3118 */
3119void __bh_read_batch(int nr, struct buffer_head *bhs[],
3120		     blk_opf_t op_flags, bool force_lock)
3121{
3122	int i;
3123
3124	for (i = 0; i < nr; i++) {
3125		struct buffer_head *bh = bhs[i];
3126
3127		if (buffer_uptodate(bh))
3128			continue;
3129
3130		if (force_lock)
3131			lock_buffer(bh);
3132		else
3133			if (!trylock_buffer(bh))
3134				continue;
3135
3136		if (buffer_uptodate(bh)) {
3137			unlock_buffer(bh);
3138			continue;
3139		}
3140
3141		bh->b_end_io = end_buffer_read_sync;
3142		get_bh(bh);
3143		submit_bh(REQ_OP_READ | op_flags, bh);
3144	}
3145}
3146EXPORT_SYMBOL(__bh_read_batch);
3147
3148void __init buffer_init(void)
3149{
3150	unsigned long nrpages;
3151	int ret;
3152
3153	bh_cachep = KMEM_CACHE(buffer_head,
3154				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
3155	/*
3156	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3157	 */
3158	nrpages = (nr_free_buffer_pages() * 10) / 100;
3159	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3160	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
3161					NULL, buffer_exit_cpu_dead);
3162	WARN_ON(ret < 0);
3163}
Configure Feed

Configure Feed