fs/jbd2/transaction.c at master

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / jbd2 / transaction.c
at master 2831 lines 90 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * linux/fs/jbd2/transaction.c
   4 *
   5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   6 *
   7 * Copyright 1998 Red Hat corp --- All Rights Reserved
   8 *
   9 * Generic filesystem transaction handling code; part of the ext2fs
  10 * journaling system.
  11 *
  12 * This file manages transactions (compound commits managed by the
  13 * journaling code) and handles (individual atomic operations by the
  14 * filesystem).
  15 */
  16
  17#include <linux/time.h>
  18#include <linux/fs.h>
  19#include <linux/jbd2.h>
  20#include <linux/errno.h>
  21#include <linux/slab.h>
  22#include <linux/timer.h>
  23#include <linux/mm.h>
  24#include <linux/highmem.h>
  25#include <linux/hrtimer.h>
  26#include <linux/backing-dev.h>
  27#include <linux/bug.h>
  28#include <linux/module.h>
  29#include <linux/sched/mm.h>
  30
  31#include <trace/events/jbd2.h>
  32
  33static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
  34static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
  35
  36static struct kmem_cache *transaction_cache;
  37int __init jbd2_journal_init_transaction_cache(void)
  38{
  39	J_ASSERT(!transaction_cache);
  40	transaction_cache = kmem_cache_create("jbd2_transaction_s",
  41					sizeof(transaction_t),
  42					0,
  43					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
  44					NULL);
  45	if (!transaction_cache) {
  46		pr_emerg("JBD2: failed to create transaction cache\n");
  47		return -ENOMEM;
  48	}
  49	return 0;
  50}
  51
  52void jbd2_journal_destroy_transaction_cache(void)
  53{
  54	kmem_cache_destroy(transaction_cache);
  55	transaction_cache = NULL;
  56}
  57
  58void jbd2_journal_free_transaction(transaction_t *transaction)
  59{
  60	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
  61		return;
  62	kmem_cache_free(transaction_cache, transaction);
  63}
  64
  65/*
  66 * jbd2_get_transaction: obtain a new transaction_t object.
  67 *
  68 * Simply initialise a new transaction. Initialize it in
  69 * RUNNING state and add it to the current journal (which should not
  70 * have an existing running transaction: we only make a new transaction
  71 * once we have started to commit the old one).
  72 *
  73 * Preconditions:
  74 *	The journal MUST be locked.  We don't perform atomic mallocs on the
  75 *	new transaction	and we can't block without protecting against other
  76 *	processes trying to touch the journal while it is in transition.
  77 *
  78 */
  79
  80static void jbd2_get_transaction(journal_t *journal,
  81				transaction_t *transaction)
  82{
  83	transaction->t_journal = journal;
  84	transaction->t_state = T_RUNNING;
  85	transaction->t_start_time = ktime_get();
  86	transaction->t_tid = journal->j_transaction_sequence++;
  87	transaction->t_expires = jiffies + journal->j_commit_interval;
  88	atomic_set(&transaction->t_updates, 0);
  89	atomic_set(&transaction->t_outstanding_credits,
  90		   journal->j_transaction_overhead_buffers +
  91		   atomic_read(&journal->j_reserved_credits));
  92	atomic_set(&transaction->t_outstanding_revokes, 0);
  93	atomic_set(&transaction->t_handle_count, 0);
  94	INIT_LIST_HEAD(&transaction->t_inode_list);
  95
  96	/* Set up the commit timer for the new transaction. */
  97	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
  98	add_timer(&journal->j_commit_timer);
  99
 100	J_ASSERT(journal->j_running_transaction == NULL);
 101	journal->j_running_transaction = transaction;
 102	transaction->t_max_wait = 0;
 103	transaction->t_start = jiffies;
 104	transaction->t_requested = 0;
 105}
 106
 107/*
 108 * Handle management.
 109 *
 110 * A handle_t is an object which represents a single atomic update to a
 111 * filesystem, and which tracks all of the modifications which form part
 112 * of that one update.
 113 */
 114
 115/*
 116 * t_max_wait is carefully updated here with use of atomic compare exchange.
 117 * Note that there could be multiplre threads trying to do this simultaneously
 118 * hence using cmpxchg to avoid any use of locks in this case.
 119 */
 120static inline void update_t_max_wait(transaction_t *transaction,
 121				     unsigned long ts)
 122{
 123	unsigned long oldts, newts;
 124
 125	if (time_after(transaction->t_start, ts)) {
 126		newts = jbd2_time_diff(ts, transaction->t_start);
 127		oldts = READ_ONCE(transaction->t_max_wait);
 128		while (oldts < newts)
 129			oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
 130	}
 131}
 132
 133/*
 134 * Wait until running transaction passes to T_FLUSH state and new transaction
 135 * can thus be started. Also starts the commit if needed. The function expects
 136 * running transaction to exist and releases j_state_lock.
 137 */
 138static void wait_transaction_locked(journal_t *journal)
 139	__releases(journal->j_state_lock)
 140{
 141	DEFINE_WAIT(wait);
 142	int need_to_start;
 143	tid_t tid = journal->j_running_transaction->t_tid;
 144
 145	prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
 146			TASK_UNINTERRUPTIBLE);
 147	need_to_start = !tid_geq(journal->j_commit_request, tid);
 148	read_unlock(&journal->j_state_lock);
 149	if (need_to_start)
 150		jbd2_log_start_commit(journal, tid);
 151	jbd2_might_wait_for_commit(journal);
 152	schedule();
 153	finish_wait(&journal->j_wait_transaction_locked, &wait);
 154}
 155
 156/*
 157 * Wait until running transaction transitions from T_SWITCH to T_FLUSH
 158 * state and new transaction can thus be started. The function releases
 159 * j_state_lock.
 160 */
 161static void wait_transaction_switching(journal_t *journal)
 162	__releases(journal->j_state_lock)
 163{
 164	DEFINE_WAIT(wait);
 165
 166	if (WARN_ON(!journal->j_running_transaction ||
 167		    journal->j_running_transaction->t_state != T_SWITCH)) {
 168		read_unlock(&journal->j_state_lock);
 169		return;
 170	}
 171	prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
 172			TASK_UNINTERRUPTIBLE);
 173	read_unlock(&journal->j_state_lock);
 174	/*
 175	 * We don't call jbd2_might_wait_for_commit() here as there's no
 176	 * waiting for outstanding handles happening anymore in T_SWITCH state
 177	 * and handling of reserved handles actually relies on that for
 178	 * correctness.
 179	 */
 180	schedule();
 181	finish_wait(&journal->j_wait_transaction_locked, &wait);
 182}
 183
 184static void sub_reserved_credits(journal_t *journal, int blocks)
 185{
 186	atomic_sub(blocks, &journal->j_reserved_credits);
 187	wake_up(&journal->j_wait_reserved);
 188}
 189
 190/* Maximum number of blocks for user transaction payload */
 191static int jbd2_max_user_trans_buffers(journal_t *journal)
 192{
 193	return journal->j_max_transaction_buffers -
 194				journal->j_transaction_overhead_buffers;
 195}
 196
 197/*
 198 * Wait until we can add credits for handle to the running transaction.  Called
 199 * with j_state_lock held for reading. Returns 0 if handle joined the running
 200 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
 201 * caller must retry.
 202 *
 203 * Note: because j_state_lock may be dropped depending on the return
 204 * value, we need to fake out sparse so ti doesn't complain about a
 205 * locking imbalance.  Callers of add_transaction_credits will need to
 206 * make a similar accomodation.
 207 */
 208static int add_transaction_credits(journal_t *journal, int blocks,
 209				   int rsv_blocks)
 210__must_hold(&journal->j_state_lock)
 211{
 212	transaction_t *t = journal->j_running_transaction;
 213	int needed;
 214	int total = blocks + rsv_blocks;
 215
 216	/*
 217	 * If the current transaction is locked down for commit, wait
 218	 * for the lock to be released.
 219	 */
 220	if (t->t_state != T_RUNNING) {
 221		WARN_ON_ONCE(t->t_state >= T_FLUSH);
 222		wait_transaction_locked(journal);
 223		__acquire(&journal->j_state_lock); /* fake out sparse */
 224		return 1;
 225	}
 226
 227	/*
 228	 * If there is not enough space left in the log to write all
 229	 * potential buffers requested by this operation, we need to
 230	 * stall pending a log checkpoint to free some more log space.
 231	 */
 232	needed = atomic_add_return(total, &t->t_outstanding_credits);
 233	if (needed > journal->j_max_transaction_buffers) {
 234		/*
 235		 * If the current transaction is already too large,
 236		 * then start to commit it: we can then go back and
 237		 * attach this handle to a new transaction.
 238		 */
 239		atomic_sub(total, &t->t_outstanding_credits);
 240
 241		/*
 242		 * Is the number of reserved credits in the current transaction too
 243		 * big to fit this handle? Wait until reserved credits are freed.
 244		 */
 245		if (atomic_read(&journal->j_reserved_credits) + total >
 246		    jbd2_max_user_trans_buffers(journal)) {
 247			read_unlock(&journal->j_state_lock);
 248			jbd2_might_wait_for_commit(journal);
 249			wait_event(journal->j_wait_reserved,
 250				   atomic_read(&journal->j_reserved_credits) + total <=
 251				   jbd2_max_user_trans_buffers(journal));
 252			__acquire(&journal->j_state_lock); /* fake out sparse */
 253			return 1;
 254		}
 255
 256		wait_transaction_locked(journal);
 257		__acquire(&journal->j_state_lock); /* fake out sparse */
 258		return 1;
 259	}
 260
 261	/*
 262	 * The commit code assumes that it can get enough log space
 263	 * without forcing a checkpoint.  This is *critical* for
 264	 * correctness: a checkpoint of a buffer which is also
 265	 * associated with a committing transaction creates a deadlock,
 266	 * so commit simply cannot force through checkpoints.
 267	 *
 268	 * We must therefore ensure the necessary space in the journal
 269	 * *before* starting to dirty potentially checkpointed buffers
 270	 * in the new transaction.
 271	 */
 272	if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
 273		atomic_sub(total, &t->t_outstanding_credits);
 274		read_unlock(&journal->j_state_lock);
 275		jbd2_might_wait_for_commit(journal);
 276		write_lock(&journal->j_state_lock);
 277		if (jbd2_log_space_left(journal) <
 278					journal->j_max_transaction_buffers)
 279			__jbd2_log_wait_for_space(journal);
 280		write_unlock(&journal->j_state_lock);
 281		__acquire(&journal->j_state_lock); /* fake out sparse */
 282		return 1;
 283	}
 284
 285	/* No reservation? We are done... */
 286	if (!rsv_blocks)
 287		return 0;
 288
 289	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
 290	/* We allow at most half of a transaction to be reserved */
 291	if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
 292		sub_reserved_credits(journal, rsv_blocks);
 293		atomic_sub(total, &t->t_outstanding_credits);
 294		read_unlock(&journal->j_state_lock);
 295		jbd2_might_wait_for_commit(journal);
 296		wait_event(journal->j_wait_reserved,
 297			 atomic_read(&journal->j_reserved_credits) + rsv_blocks
 298			 <= jbd2_max_user_trans_buffers(journal) / 2);
 299		__acquire(&journal->j_state_lock); /* fake out sparse */
 300		return 1;
 301	}
 302	return 0;
 303}
 304
 305/*
 306 * start_this_handle: Given a handle, deal with any locking or stalling
 307 * needed to make sure that there is enough journal space for the handle
 308 * to begin.  Attach the handle to a transaction and set up the
 309 * transaction's buffer credits.
 310 */
 311
 312static int start_this_handle(journal_t *journal, handle_t *handle,
 313			     gfp_t gfp_mask)
 314{
 315	transaction_t	*transaction, *new_transaction = NULL;
 316	int		blocks = handle->h_total_credits;
 317	int		rsv_blocks = 0;
 318	unsigned long ts = jiffies;
 319
 320	if (handle->h_rsv_handle)
 321		rsv_blocks = handle->h_rsv_handle->h_total_credits;
 322
 323	/*
 324	 * Limit the number of reserved credits to 1/2 of maximum transaction
 325	 * size and limit the number of total credits to not exceed maximum
 326	 * transaction size per operation.
 327	 */
 328	if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
 329	    rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
 330		printk(KERN_ERR "JBD2: %s wants too many credits "
 331		       "credits:%d rsv_credits:%d max:%d\n",
 332		       current->comm, blocks, rsv_blocks,
 333		       jbd2_max_user_trans_buffers(journal));
 334		WARN_ON(1);
 335		return -ENOSPC;
 336	}
 337
 338alloc_transaction:
 339	/*
 340	 * This check is racy but it is just an optimization of allocating new
 341	 * transaction early if there are high chances we'll need it. If we
 342	 * guess wrong, we'll retry or free unused transaction.
 343	 */
 344	if (!data_race(journal->j_running_transaction)) {
 345		/*
 346		 * If __GFP_FS is not present, then we may be being called from
 347		 * inside the fs writeback layer, so we MUST NOT fail.
 348		 */
 349		if ((gfp_mask & __GFP_FS) == 0)
 350			gfp_mask |= __GFP_NOFAIL;
 351		new_transaction = kmem_cache_zalloc(transaction_cache,
 352						    gfp_mask);
 353		if (!new_transaction)
 354			return -ENOMEM;
 355	}
 356
 357	jbd2_debug(3, "New handle %p going live.\n", handle);
 358
 359	/*
 360	 * We need to hold j_state_lock until t_updates has been incremented,
 361	 * for proper journal barrier handling
 362	 */
 363repeat:
 364	read_lock(&journal->j_state_lock);
 365	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
 366	if (is_journal_aborted(journal) ||
 367	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
 368		read_unlock(&journal->j_state_lock);
 369		jbd2_journal_free_transaction(new_transaction);
 370		return -EROFS;
 371	}
 372
 373	/*
 374	 * Wait on the journal's transaction barrier if necessary. Specifically
 375	 * we allow reserved handles to proceed because otherwise commit could
 376	 * deadlock on page writeback not being able to complete.
 377	 */
 378	if (!handle->h_reserved && journal->j_barrier_count) {
 379		read_unlock(&journal->j_state_lock);
 380		wait_event(journal->j_wait_transaction_locked,
 381				journal->j_barrier_count == 0);
 382		goto repeat;
 383	}
 384
 385	if (!journal->j_running_transaction) {
 386		read_unlock(&journal->j_state_lock);
 387		if (!new_transaction)
 388			goto alloc_transaction;
 389		write_lock(&journal->j_state_lock);
 390		if (!journal->j_running_transaction &&
 391		    (handle->h_reserved || !journal->j_barrier_count)) {
 392			jbd2_get_transaction(journal, new_transaction);
 393			new_transaction = NULL;
 394		}
 395		write_unlock(&journal->j_state_lock);
 396		goto repeat;
 397	}
 398
 399	transaction = journal->j_running_transaction;
 400
 401	if (!handle->h_reserved) {
 402		/* We may have dropped j_state_lock - restart in that case */
 403		if (add_transaction_credits(journal, blocks, rsv_blocks)) {
 404			/*
 405			 * add_transaction_credits releases
 406			 * j_state_lock on a non-zero return
 407			 */
 408			__release(&journal->j_state_lock);
 409			goto repeat;
 410		}
 411	} else {
 412		/*
 413		 * We have handle reserved so we are allowed to join T_LOCKED
 414		 * transaction and we don't have to check for transaction size
 415		 * and journal space. But we still have to wait while running
 416		 * transaction is being switched to a committing one as it
 417		 * won't wait for any handles anymore.
 418		 */
 419		if (transaction->t_state == T_SWITCH) {
 420			wait_transaction_switching(journal);
 421			goto repeat;
 422		}
 423		sub_reserved_credits(journal, blocks);
 424		handle->h_reserved = 0;
 425	}
 426
 427	/* OK, account for the buffers that this operation expects to
 428	 * use and add the handle to the running transaction.
 429	 */
 430	update_t_max_wait(transaction, ts);
 431	handle->h_transaction = transaction;
 432	handle->h_requested_credits = blocks;
 433	handle->h_revoke_credits_requested = handle->h_revoke_credits;
 434	handle->h_start_jiffies = jiffies;
 435	atomic_inc(&transaction->t_updates);
 436	atomic_inc(&transaction->t_handle_count);
 437	jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
 438		  handle, blocks,
 439		  atomic_read(&transaction->t_outstanding_credits),
 440		  jbd2_log_space_left(journal));
 441	read_unlock(&journal->j_state_lock);
 442	current->journal_info = handle;
 443
 444	rwsem_acquire_read(&journal->j_trans_commit_map, 0, 1, _THIS_IP_);
 445	jbd2_journal_free_transaction(new_transaction);
 446	/*
 447	 * Ensure that no allocations done while the transaction is open are
 448	 * going to recurse back to the fs layer.
 449	 */
 450	handle->saved_alloc_context = memalloc_nofs_save();
 451	return 0;
 452}
 453
 454/* Allocate a new handle.  This should probably be in a slab... */
 455static handle_t *new_handle(int nblocks)
 456{
 457	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
 458	if (!handle)
 459		return NULL;
 460	handle->h_total_credits = nblocks;
 461	handle->h_ref = 1;
 462
 463	return handle;
 464}
 465
 466handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 467			      int revoke_records, gfp_t gfp_mask,
 468			      unsigned int type, unsigned int line_no)
 469{
 470	handle_t *handle = journal_current_handle();
 471	int err;
 472
 473	if (!journal)
 474		return ERR_PTR(-EROFS);
 475
 476	if (handle) {
 477		if (WARN_ON_ONCE(handle->h_transaction->t_journal != journal))
 478			return ERR_PTR(-EINVAL);
 479		handle->h_ref++;
 480		return handle;
 481	}
 482
 483	nblocks += DIV_ROUND_UP(revoke_records,
 484				journal->j_revoke_records_per_block);
 485	handle = new_handle(nblocks);
 486	if (!handle)
 487		return ERR_PTR(-ENOMEM);
 488	if (rsv_blocks) {
 489		handle_t *rsv_handle;
 490
 491		rsv_handle = new_handle(rsv_blocks);
 492		if (!rsv_handle) {
 493			jbd2_free_handle(handle);
 494			return ERR_PTR(-ENOMEM);
 495		}
 496		rsv_handle->h_reserved = 1;
 497		rsv_handle->h_journal = journal;
 498		handle->h_rsv_handle = rsv_handle;
 499	}
 500	handle->h_revoke_credits = revoke_records;
 501
 502	err = start_this_handle(journal, handle, gfp_mask);
 503	if (err < 0) {
 504		if (handle->h_rsv_handle)
 505			jbd2_free_handle(handle->h_rsv_handle);
 506		jbd2_free_handle(handle);
 507		return ERR_PTR(err);
 508	}
 509	handle->h_type = type;
 510	handle->h_line_no = line_no;
 511	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
 512				handle->h_transaction->t_tid, type,
 513				line_no, nblocks);
 514
 515	return handle;
 516}
 517EXPORT_SYMBOL(jbd2__journal_start);
 518
 519
 520/**
 521 * jbd2_journal_start() - Obtain a new handle.
 522 * @journal: Journal to start transaction on.
 523 * @nblocks: number of block buffer we might modify
 524 *
 525 * We make sure that the transaction can guarantee at least nblocks of
 526 * modified buffers in the log.  We block until the log can guarantee
 527 * that much space. Additionally, if rsv_blocks > 0, we also create another
 528 * handle with rsv_blocks reserved blocks in the journal. This handle is
 529 * stored in h_rsv_handle. It is not attached to any particular transaction
 530 * and thus doesn't block transaction commit. If the caller uses this reserved
 531 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
 532 * on the parent handle will dispose the reserved one. Reserved handle has to
 533 * be converted to a normal handle using jbd2_journal_start_reserved() before
 534 * it can be used.
 535 *
 536 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 537 * on failure.
 538 */
 539handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 540{
 541	return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
 542}
 543EXPORT_SYMBOL(jbd2_journal_start);
 544
 545static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
 546{
 547	journal_t *journal = handle->h_journal;
 548
 549	WARN_ON(!handle->h_reserved);
 550	sub_reserved_credits(journal, handle->h_total_credits);
 551	if (t)
 552		atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
 553}
 554
 555void jbd2_journal_free_reserved(handle_t *handle)
 556{
 557	journal_t *journal = handle->h_journal;
 558
 559	/* Get j_state_lock to pin running transaction if it exists */
 560	read_lock(&journal->j_state_lock);
 561	__jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
 562	read_unlock(&journal->j_state_lock);
 563	jbd2_free_handle(handle);
 564}
 565EXPORT_SYMBOL(jbd2_journal_free_reserved);
 566
 567/**
 568 * jbd2_journal_start_reserved() - start reserved handle
 569 * @handle: handle to start
 570 * @type: for handle statistics
 571 * @line_no: for handle statistics
 572 *
 573 * Start handle that has been previously reserved with jbd2_journal_reserve().
 574 * This attaches @handle to the running transaction (or creates one if there's
 575 * not transaction running). Unlike jbd2_journal_start() this function cannot
 576 * block on journal commit, checkpointing, or similar stuff. It can block on
 577 * memory allocation or frozen journal though.
 578 *
 579 * Return 0 on success, non-zero on error - handle is freed in that case.
 580 */
 581int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
 582				unsigned int line_no)
 583{
 584	journal_t *journal = handle->h_journal;
 585	int ret = -EIO;
 586
 587	if (WARN_ON(!handle->h_reserved)) {
 588		/* Someone passed in normal handle? Just stop it. */
 589		jbd2_journal_stop(handle);
 590		return ret;
 591	}
 592	/*
 593	 * Usefulness of mixing of reserved and unreserved handles is
 594	 * questionable. So far nobody seems to need it so just error out.
 595	 */
 596	if (WARN_ON(current->journal_info)) {
 597		jbd2_journal_free_reserved(handle);
 598		return ret;
 599	}
 600
 601	handle->h_journal = NULL;
 602	/*
 603	 * GFP_NOFS is here because callers are likely from writeback or
 604	 * similarly constrained call sites
 605	 */
 606	ret = start_this_handle(journal, handle, GFP_NOFS);
 607	if (ret < 0) {
 608		handle->h_journal = journal;
 609		jbd2_journal_free_reserved(handle);
 610		return ret;
 611	}
 612	handle->h_type = type;
 613	handle->h_line_no = line_no;
 614	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
 615				handle->h_transaction->t_tid, type,
 616				line_no, handle->h_total_credits);
 617	return 0;
 618}
 619EXPORT_SYMBOL(jbd2_journal_start_reserved);
 620
 621/**
 622 * jbd2_journal_extend() - extend buffer credits.
 623 * @handle:  handle to 'extend'
 624 * @nblocks: nr blocks to try to extend by.
 625 * @revoke_records: number of revoke records to try to extend by.
 626 *
 627 * Some transactions, such as large extends and truncates, can be done
 628 * atomically all at once or in several stages.  The operation requests
 629 * a credit for a number of buffer modifications in advance, but can
 630 * extend its credit if it needs more.
 631 *
 632 * jbd2_journal_extend tries to give the running handle more buffer credits.
 633 * It does not guarantee that allocation - this is a best-effort only.
 634 * The calling process MUST be able to deal cleanly with a failure to
 635 * extend here.
 636 *
 637 * Return 0 on success, non-zero on failure.
 638 *
 639 * return code < 0 implies an error
 640 * return code > 0 implies normal transaction-full status.
 641 */
 642int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 643{
 644	transaction_t *transaction = handle->h_transaction;
 645	journal_t *journal;
 646	int result;
 647	int wanted;
 648
 649	if (is_handle_aborted(handle))
 650		return -EROFS;
 651	journal = transaction->t_journal;
 652
 653	result = 1;
 654
 655	read_lock(&journal->j_state_lock);
 656
 657	/* Don't extend a locked-down transaction! */
 658	if (transaction->t_state != T_RUNNING) {
 659		jbd2_debug(3, "denied handle %p %d blocks: "
 660			  "transaction not running\n", handle, nblocks);
 661		goto error_out;
 662	}
 663
 664	nblocks += DIV_ROUND_UP(
 665			handle->h_revoke_credits_requested + revoke_records,
 666			journal->j_revoke_records_per_block) -
 667		DIV_ROUND_UP(
 668			handle->h_revoke_credits_requested,
 669			journal->j_revoke_records_per_block);
 670	wanted = atomic_add_return(nblocks,
 671				   &transaction->t_outstanding_credits);
 672
 673	if (wanted > journal->j_max_transaction_buffers) {
 674		jbd2_debug(3, "denied handle %p %d blocks: "
 675			  "transaction too large\n", handle, nblocks);
 676		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 677		goto error_out;
 678	}
 679
 680	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
 681				 transaction->t_tid,
 682				 handle->h_type, handle->h_line_no,
 683				 handle->h_total_credits,
 684				 nblocks);
 685
 686	handle->h_total_credits += nblocks;
 687	handle->h_requested_credits += nblocks;
 688	handle->h_revoke_credits += revoke_records;
 689	handle->h_revoke_credits_requested += revoke_records;
 690	result = 0;
 691
 692	jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
 693error_out:
 694	read_unlock(&journal->j_state_lock);
 695	return result;
 696}
 697
 698static void stop_this_handle(handle_t *handle)
 699{
 700	transaction_t *transaction = handle->h_transaction;
 701	journal_t *journal = transaction->t_journal;
 702	int revokes;
 703
 704	J_ASSERT(journal_current_handle() == handle);
 705	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 706	current->journal_info = NULL;
 707	/*
 708	 * Subtract necessary revoke descriptor blocks from handle credits. We
 709	 * take care to account only for revoke descriptor blocks the
 710	 * transaction will really need as large sequences of transactions with
 711	 * small numbers of revokes are relatively common.
 712	 */
 713	revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
 714	if (revokes) {
 715		int t_revokes, revoke_descriptors;
 716		int rr_per_blk = journal->j_revoke_records_per_block;
 717
 718		WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
 719				> handle->h_total_credits);
 720		t_revokes = atomic_add_return(revokes,
 721				&transaction->t_outstanding_revokes);
 722		revoke_descriptors =
 723			DIV_ROUND_UP(t_revokes, rr_per_blk) -
 724			DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
 725		handle->h_total_credits -= revoke_descriptors;
 726	}
 727	atomic_sub(handle->h_total_credits,
 728		   &transaction->t_outstanding_credits);
 729	if (handle->h_rsv_handle)
 730		__jbd2_journal_unreserve_handle(handle->h_rsv_handle,
 731						transaction);
 732	if (atomic_dec_and_test(&transaction->t_updates))
 733		wake_up(&journal->j_wait_updates);
 734
 735	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
 736	/*
 737	 * Scope of the GFP_NOFS context is over here and so we can restore the
 738	 * original alloc context.
 739	 */
 740	memalloc_nofs_restore(handle->saved_alloc_context);
 741}
 742
 743/**
 744 * jbd2__journal_restart() - restart a handle .
 745 * @handle:  handle to restart
 746 * @nblocks: nr credits requested
 747 * @revoke_records: number of revoke record credits requested
 748 * @gfp_mask: memory allocation flags (for start_this_handle)
 749 *
 750 * Restart a handle for a multi-transaction filesystem
 751 * operation.
 752 *
 753 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 754 * to a running handle, a call to jbd2_journal_restart will commit the
 755 * handle's transaction so far and reattach the handle to a new
 756 * transaction capable of guaranteeing the requested number of
 757 * credits. We preserve reserved handle if there's any attached to the
 758 * passed in handle.
 759 */
 760int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
 761			  gfp_t gfp_mask)
 762{
 763	transaction_t *transaction = handle->h_transaction;
 764	journal_t *journal;
 765	tid_t		tid;
 766	int		need_to_start;
 767	int		ret;
 768
 769	/* If we've had an abort of any type, don't even think about
 770	 * actually doing the restart! */
 771	if (is_handle_aborted(handle))
 772		return 0;
 773	journal = transaction->t_journal;
 774	tid = transaction->t_tid;
 775
 776	/*
 777	 * First unlink the handle from its current transaction, and start the
 778	 * commit on that.
 779	 */
 780	jbd2_debug(2, "restarting handle %p\n", handle);
 781	stop_this_handle(handle);
 782	handle->h_transaction = NULL;
 783
 784	/*
 785	 * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
 786 	 * get rid of pointless j_state_lock traffic like this.
 787	 */
 788	read_lock(&journal->j_state_lock);
 789	need_to_start = !tid_geq(journal->j_commit_request, tid);
 790	read_unlock(&journal->j_state_lock);
 791	if (need_to_start)
 792		jbd2_log_start_commit(journal, tid);
 793	handle->h_total_credits = nblocks +
 794		DIV_ROUND_UP(revoke_records,
 795			     journal->j_revoke_records_per_block);
 796	handle->h_revoke_credits = revoke_records;
 797	ret = start_this_handle(journal, handle, gfp_mask);
 798	trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
 799				 ret ? 0 : handle->h_transaction->t_tid,
 800				 handle->h_type, handle->h_line_no,
 801				 handle->h_total_credits);
 802	return ret;
 803}
 804EXPORT_SYMBOL(jbd2__journal_restart);
 805
 806
 807int jbd2_journal_restart(handle_t *handle, int nblocks)
 808{
 809	return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
 810}
 811EXPORT_SYMBOL(jbd2_journal_restart);
 812
 813/*
 814 * Waits for any outstanding t_updates to finish.
 815 * This is called with write j_state_lock held.
 816 */
 817void jbd2_journal_wait_updates(journal_t *journal)
 818{
 819	DEFINE_WAIT(wait);
 820
 821	while (1) {
 822		/*
 823		 * Note that the running transaction can get freed under us if
 824		 * this transaction is getting committed in
 825		 * jbd2_journal_commit_transaction() ->
 826		 * jbd2_journal_free_transaction(). This can only happen when we
 827		 * release j_state_lock -> schedule() -> acquire j_state_lock.
 828		 * Hence we should everytime retrieve new j_running_transaction
 829		 * value (after j_state_lock release acquire cycle), else it may
 830		 * lead to use-after-free of old freed transaction.
 831		 */
 832		transaction_t *transaction = journal->j_running_transaction;
 833
 834		if (!transaction)
 835			break;
 836
 837		prepare_to_wait(&journal->j_wait_updates, &wait,
 838				TASK_UNINTERRUPTIBLE);
 839		if (!atomic_read(&transaction->t_updates)) {
 840			finish_wait(&journal->j_wait_updates, &wait);
 841			break;
 842		}
 843		write_unlock(&journal->j_state_lock);
 844		schedule();
 845		finish_wait(&journal->j_wait_updates, &wait);
 846		write_lock(&journal->j_state_lock);
 847	}
 848}
 849
 850/**
 851 * jbd2_journal_lock_updates () - establish a transaction barrier.
 852 * @journal:  Journal to establish a barrier on.
 853 *
 854 * This locks out any further updates from being started, and blocks
 855 * until all existing updates have completed, returning only once the
 856 * journal is in a quiescent state with no updates running.
 857 *
 858 * The journal lock should not be held on entry.
 859 */
 860void jbd2_journal_lock_updates(journal_t *journal)
 861{
 862	jbd2_might_wait_for_commit(journal);
 863
 864	write_lock(&journal->j_state_lock);
 865	++journal->j_barrier_count;
 866
 867	/* Wait until there are no reserved handles */
 868	if (atomic_read(&journal->j_reserved_credits)) {
 869		write_unlock(&journal->j_state_lock);
 870		wait_event(journal->j_wait_reserved,
 871			   atomic_read(&journal->j_reserved_credits) == 0);
 872		write_lock(&journal->j_state_lock);
 873	}
 874
 875	/* Wait until there are no running t_updates */
 876	jbd2_journal_wait_updates(journal);
 877
 878	write_unlock(&journal->j_state_lock);
 879
 880	/*
 881	 * We have now established a barrier against other normal updates, but
 882	 * we also need to barrier against other jbd2_journal_lock_updates() calls
 883	 * to make sure that we serialise special journal-locked operations
 884	 * too.
 885	 */
 886	mutex_lock(&journal->j_barrier);
 887}
 888
 889/**
 890 * jbd2_journal_unlock_updates () - release barrier
 891 * @journal:  Journal to release the barrier on.
 892 *
 893 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
 894 *
 895 * Should be called without the journal lock held.
 896 */
 897void jbd2_journal_unlock_updates (journal_t *journal)
 898{
 899	J_ASSERT(journal->j_barrier_count != 0);
 900
 901	mutex_unlock(&journal->j_barrier);
 902	write_lock(&journal->j_state_lock);
 903	--journal->j_barrier_count;
 904	write_unlock(&journal->j_state_lock);
 905	wake_up_all(&journal->j_wait_transaction_locked);
 906}
 907
 908static void warn_dirty_buffer(struct buffer_head *bh)
 909{
 910	printk(KERN_WARNING
 911	       "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
 912	       "There's a risk of filesystem corruption in case of system "
 913	       "crash.\n",
 914	       bh->b_bdev, (unsigned long long)bh->b_blocknr);
 915}
 916
 917/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
 918static void jbd2_freeze_jh_data(struct journal_head *jh)
 919{
 920	char *source;
 921	struct buffer_head *bh = jh2bh(jh);
 922
 923	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
 924	source = kmap_local_folio(bh->b_folio, bh_offset(bh));
 925	/* Fire data frozen trigger just before we copy the data */
 926	jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
 927	memcpy(jh->b_frozen_data, source, bh->b_size);
 928	kunmap_local(source);
 929
 930	/*
 931	 * Now that the frozen data is saved off, we need to store any matching
 932	 * triggers.
 933	 */
 934	jh->b_frozen_triggers = jh->b_triggers;
 935}
 936
/*
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 * @handle:     handle whose transaction wants write access
 * @jh:         journal head of the buffer being accessed
 * @force_copy: when non-zero, make a frozen copy of a buffer owned by the
 *              committing transaction even if it is not on BJ_Metadata
 *
 * Returns 0 on success, -EROFS if the handle is aborted, or -EINVAL (after
 * aborting the journal) if the journal head is found in an unexpected
 * state.
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
			int force_copy)
{
	struct buffer_head *bh;
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal;
	int error;
	char *frozen_buffer = NULL;
	unsigned long start_lock, time_lock;

	journal = transaction->t_journal;

	jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);

	JBUFFER_TRACE(jh, "entry");
repeat:
	bh = jh2bh(jh);

	/* @@@ Need to check for errors here at some point. */

	start_lock = jiffies;
	lock_buffer(bh);
	spin_lock(&jh->b_state_lock);

	/* If it takes too long to lock the buffer, trace it */
	time_lock = jbd2_time_diff(start_lock, jiffies);
	if (time_lock > HZ/10)
		trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
			jiffies_to_msecs(time_lock));

	/* We now hold the buffer lock so it is safe to query the buffer
	 * state.  Is the buffer dirty?
	 *
	 * If so, there are two possibilities.  The buffer may be
	 * non-journaled, and undergoing a quite legitimate writeback.
	 * Otherwise, it is journaled, and we don't expect dirty buffers
	 * in that state (the buffers should be marked JBD_Dirty
	 * instead.)  So either the IO is being done under our own
	 * control and this is a bug, or it's a third party IO such as
	 * dump(8) (which may leave the buffer scheduled for read ---
	 * ie. locked but not dirty) or tune2fs (which may actually have
	 * the buffer dirtied, ugh.)  */

	if (buffer_dirty(bh) && jh->b_transaction) {
		warn_dirty_buffer(bh);
		/*
		 * We need to clean the dirty flag and we must do it under the
		 * buffer lock to be sure we don't race with running write-out.
		 */
		JBUFFER_TRACE(jh, "Journalling dirty buffer");
		clear_buffer_dirty(bh);
		/*
		 * The buffer is going to be added to BJ_Reserved list now and
		 * nothing guarantees jbd2_journal_dirty_metadata() will be
		 * ever called for it. So we need to set jbddirty bit here to
		 * make sure the buffer is dirtied and written out when the
		 * journaling machinery is done with it.
		 */
		set_buffer_jbddirty(bh);
	}

	error = -EROFS;
	if (is_handle_aborted(handle)) {
		spin_unlock(&jh->b_state_lock);
		unlock_buffer(bh);
		goto out;
	}
	error = 0;

	/*
	 * The buffer is already part of this transaction if b_transaction or
	 * b_next_transaction points to it
	 */
	if (jh->b_transaction == transaction ||
	    jh->b_next_transaction == transaction) {
		unlock_buffer(bh);
		goto done;
	}

	/*
	 * this is the first time this transaction is touching this buffer,
	 * reset the modified flag
	 */
	jh->b_modified = 0;

	/*
	 * If the buffer is not journaled right now, we need to make sure it
	 * doesn't get written to disk before the caller actually commits the
	 * new data
	 */
	if (!jh->b_transaction) {
		JBUFFER_TRACE(jh, "no transaction");
		if (WARN_ON_ONCE(jh->b_next_transaction)) {
			spin_unlock(&jh->b_state_lock);
			unlock_buffer(bh);
			error = -EINVAL;
			jbd2_journal_abort(journal, error);
			goto out;
		}
		JBUFFER_TRACE(jh, "file as BJ_Reserved");
		/*
		 * Make sure all stores to jh (b_modified, b_frozen_data) are
		 * visible before attaching it to the running transaction.
		 * Paired with barrier in jbd2_write_access_granted()
		 */
		smp_wmb();
		spin_lock(&journal->j_list_lock);
		if (test_clear_buffer_dirty(bh)) {
			/*
			 * Execute buffer dirty clearing and jh->b_transaction
			 * assignment under journal->j_list_lock locked to
			 * prevent bh being removed from checkpoint list if
			 * the buffer is in an intermediate state (not dirty
			 * and jh->b_transaction is NULL).
			 */
			JBUFFER_TRACE(jh, "Journalling dirty buffer");
			set_buffer_jbddirty(bh);
		}
		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
		spin_unlock(&journal->j_list_lock);
		unlock_buffer(bh);
		goto done;
	}
	/*
	 * From here on the buffer lock is no longer needed; only
	 * jh->b_state_lock remains held.
	 */
	unlock_buffer(bh);

	/*
	 * If there is already a copy-out version of this buffer, then we don't
	 * need to make another one
	 */
	if (jh->b_frozen_data) {
		JBUFFER_TRACE(jh, "has frozen data");
		if (WARN_ON_ONCE(jh->b_next_transaction)) {
			spin_unlock(&jh->b_state_lock);
			error = -EINVAL;
			jbd2_journal_abort(journal, error);
			goto out;
		}
		goto attach_next;
	}

	JBUFFER_TRACE(jh, "owned by older transaction");
	if (WARN_ON_ONCE(jh->b_next_transaction ||
			 jh->b_transaction !=
			 journal->j_committing_transaction)) {
		pr_err("JBD2: %s: assertion failure: b_next_transaction=%p b_transaction=%p j_committing_transaction=%p\n",
		       journal->j_devname, jh->b_next_transaction,
		       jh->b_transaction, journal->j_committing_transaction);
		spin_unlock(&jh->b_state_lock);
		error = -EINVAL;
		jbd2_journal_abort(journal, error);
		goto out;
	}

	/*
	 * There is one case we have to be very careful about.  If the
	 * committing transaction is currently writing this buffer out to disk
	 * and has NOT made a copy-out, then we cannot modify the buffer
	 * contents at all right now.  The essence of copy-out is that it is
	 * the extra copy, not the primary copy, which gets journaled.  If the
	 * primary copy is already going to disk then we cannot do copy-out
	 * here.
	 */
	if (buffer_shadow(bh)) {
		JBUFFER_TRACE(jh, "on shadow: sleep");
		spin_unlock(&jh->b_state_lock);
		wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/*
	 * Only do the copy if the currently-owning transaction still needs it.
	 * If buffer isn't on BJ_Metadata list, the committing transaction is
	 * past that stage (here we use the fact that BH_Shadow is set under
	 * bh_state lock together with refiling to BJ_Shadow list and at this
	 * point we know the buffer doesn't have BH_Shadow set).
	 *
	 * Subtle point, though: if this is a get_undo_access, then we will be
	 * relying on the frozen_data to contain the new value of the
	 * committed_data record after the transaction, so we HAVE to force the
	 * frozen_data copy in that case.
	 */
	if (jh->b_jlist == BJ_Metadata || force_copy) {
		JBUFFER_TRACE(jh, "generate frozen data");
		if (!frozen_buffer) {
			JBUFFER_TRACE(jh, "allocate memory for buffer");
			/*
			 * The allocation is done with b_state_lock dropped,
			 * so all of the state checked above has to be
			 * re-evaluated from the top afterwards.
			 */
			spin_unlock(&jh->b_state_lock);
			frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
						   GFP_NOFS | __GFP_NOFAIL);
			goto repeat;
		}
		jh->b_frozen_data = frozen_buffer;
		frozen_buffer = NULL;
		jbd2_freeze_jh_data(jh);
	}
attach_next:
	/*
	 * Make sure all stores to jh (b_modified, b_frozen_data) are visible
	 * before attaching it to the running transaction. Paired with barrier
	 * in jbd2_write_access_granted()
	 */
	smp_wmb();
	jh->b_next_transaction = transaction;

done:
	spin_unlock(&jh->b_state_lock);

	/*
	 * If we are about to journal a buffer, then any revoke pending on it is
	 * no longer valid
	 */
	jbd2_journal_cancel_revoke(handle, jh);

out:
	/*
	 * frozen_buffer is still set here only when it was allocated on an
	 * earlier pass but never installed as jh->b_frozen_data.
	 */
	if (unlikely(frozen_buffer))	/* It's usually NULL */
		jbd2_free(frozen_buffer, bh->b_size);

	JBUFFER_TRACE(jh, "exit");
	return error;
}
1167
/*
 * Fast check whether buffer is already attached to the required transaction
 *
 * @handle: handle whose transaction the buffer should belong to
 * @bh:     buffer to check
 * @undo:   when true, additionally require that the journal head already
 *          has b_committed_data
 *
 * Takes no locks other than rcu_read_lock().  Returns true only if the
 * buffer is not dirty, has a journal head whose b_transaction or
 * b_next_transaction is the handle's transaction, and that journal head
 * still points back at @bh.  Returns false in every other case.
 */
static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
							bool undo)
{
	struct journal_head *jh;
	bool ret = false;

	/* Dirty buffers require special handling... */
	if (buffer_dirty(bh))
		return false;

	/*
	 * RCU protects us from dereferencing freed pages. So the checks we do
	 * are guaranteed not to oops. However the jh slab object can get freed
	 * & reallocated while we work with it. So we have to be careful. When
	 * we see jh attached to the running transaction, we know it must stay
	 * so until the transaction is committed. Thus jh won't be freed and
	 * will be attached to the same bh while we run.  However it can
	 * happen jh gets freed, reallocated, and attached to the transaction
	 * just after we get pointer to it from bh. So we have to be careful
	 * and recheck jh still belongs to our bh before we return success.
	 */
	rcu_read_lock();
	if (!buffer_jbd(bh))
		goto out;
	/* This should be bh2jh() but that doesn't work with inline functions */
	jh = READ_ONCE(bh->b_private);
	if (!jh)
		goto out;
	/* For undo access buffer must have data copied */
	if (undo && !jh->b_committed_data)
		goto out;
	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
	    READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
		goto out;
	/*
	 * There are two reasons for the barrier here:
	 * 1) Make sure to fetch b_bh after we did previous checks so that we
	 * detect when jh went through free, realloc, attach to transaction
	 * while we were checking. Paired with implicit barrier in that path.
	 * 2) So that access to bh done after jbd2_write_access_granted()
	 * doesn't get reordered and see inconsistent state of concurrent
	 * do_get_write_access().
	 */
	smp_mb();
	if (unlikely(jh->b_bh != bh))
		goto out;
	ret = true;
out:
	rcu_read_unlock();
	return ret;
}
1220
1221/**
1222 * jbd2_journal_get_write_access() - notify intent to modify a buffer
1223 *				     for metadata (not data) update.
1224 * @handle: transaction to add buffer modifications to
1225 * @bh:     bh to be used for metadata writes
1226 *
1227 * Returns: error code or 0 on success.
1228 *
1229 * In full data journalling mode the buffer may be of type BJ_AsyncData,
1230 * because we're ``write()ing`` a buffer which is also part of a shared mapping.
1231 */
1232
1233int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
1234{
1235	struct journal_head *jh;
1236	journal_t *journal;
1237	int rc;
1238
1239	if (is_handle_aborted(handle))
1240		return -EROFS;
1241
1242	journal = handle->h_transaction->t_journal;
1243	rc = jbd2_check_fs_dev_write_error(journal);
1244	if (rc) {
1245		/*
1246		 * If the fs dev has writeback errors, it may have failed
1247		 * to async write out metadata buffers in the background.
1248		 * In this case, we could read old data from disk and write
1249		 * it out again, which may lead to on-disk filesystem
1250		 * inconsistency. Aborting journal can avoid it happen.
1251		 */
1252		jbd2_journal_abort(journal, rc);
1253		return -EIO;
1254	}
1255
1256	if (jbd2_write_access_granted(handle, bh, false))
1257		return 0;
1258
1259	jh = jbd2_journal_add_journal_head(bh);
1260	/* We do not want to get caught playing with fields which the
1261	 * log thread also manipulates.  Make sure that the buffer
1262	 * completes any outstanding IO before proceeding. */
1263	rc = do_get_write_access(handle, jh, 0);
1264	jbd2_journal_put_journal_head(jh);
1265	return rc;
1266}
1267
1268
/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand. */

/**
 * jbd2_journal_get_create_access () - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 *
 * Returns 0 on success, -EROFS if the handle is aborted or the journal
 * head is attached to a transaction in a state that is not allowed here,
 * or -EINVAL if @bh is not locked.  In the latter two cases the journal is
 * aborted as well.
 */
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal;
	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
	int err;

	jbd2_debug(5, "journal_head %p\n", jh);
	err = -EROFS;
	if (is_handle_aborted(handle))
		goto out;
	journal = transaction->t_journal;
	err = 0;

	JBUFFER_TRACE(jh, "entry");
	/*
	 * The buffer may already belong to this transaction due to pre-zeroing
	 * in the filesystem's new_block code.  It may also be on the previous,
	 * committing transaction's lists, but it HAS to be in Forget state in
	 * that case: the transaction must have deleted the buffer for it to be
	 * reused here.
	 * In the case of file system data inconsistency, for example, if the
	 * block bitmap of a referenced block is not set, it can lead to the
	 * situation where a block being committed is allocated and used again.
	 * As a result, the following condition will not be satisfied, so here
	 * we directly trigger a JBD abort instead of immediately invoking
	 * bugon.
	 */
	spin_lock(&jh->b_state_lock);
	if (!(jh->b_transaction == transaction || jh->b_transaction == NULL ||
	      (jh->b_transaction == journal->j_committing_transaction &&
	       jh->b_jlist == BJ_Forget)) || jh->b_next_transaction != NULL) {
		err = -EROFS;
		spin_unlock(&jh->b_state_lock);
		jbd2_journal_abort(journal, err);
		goto out;
	}

	/* The caller is required to hand us a locked buffer */
	if (WARN_ON_ONCE(!buffer_locked(jh2bh(jh)))) {
		err = -EINVAL;
		spin_unlock(&jh->b_state_lock);
		jbd2_journal_abort(journal, err);
		goto out;
	}

	if (jh->b_transaction == NULL) {
		/*
		 * Previous jbd2_journal_forget() could have left the buffer
		 * with jbddirty bit set because it was being committed. When
		 * the commit finished, we've filed the buffer for
		 * checkpointing and marked it dirty. Now we are reallocating
		 * the buffer so the transaction freeing it must have
		 * committed and so it's safe to clear the dirty bit.
		 */
		clear_buffer_dirty(jh2bh(jh));
		/* first access by this transaction */
		jh->b_modified = 0;

		JBUFFER_TRACE(jh, "file as BJ_Reserved");
		spin_lock(&journal->j_list_lock);
		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
		spin_unlock(&journal->j_list_lock);
	} else if (jh->b_transaction == journal->j_committing_transaction) {
		/* first access by this transaction */
		jh->b_modified = 0;

		JBUFFER_TRACE(jh, "set next transaction");
		spin_lock(&journal->j_list_lock);
		jh->b_next_transaction = transaction;
		spin_unlock(&journal->j_list_lock);
	}
	/*
	 * Remaining case: the buffer already belongs to this transaction and
	 * nothing needs to be refiled.
	 */
	spin_unlock(&jh->b_state_lock);

	/*
	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
	 * blocks which contain freed but then revoked metadata.  We need
	 * to cancel the revoke in case we end up freeing it yet again
	 * and then reallocating as data - this would cause a second revoke,
	 * which hits an assertion error.
	 */
	JBUFFER_TRACE(jh, "cancelling revoke");
	jbd2_journal_cancel_revoke(handle, jh);
out:
	/* Drop the reference taken by jbd2_journal_add_journal_head() */
	jbd2_journal_put_journal_head(jh);
	return err;
}
1374
/**
 * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
 * this for freeing and allocating space, we have to make sure that we
 * do not reuse freed space until the deallocation has been committed,
 * since if we overwrote that space we would make the delete
 * un-rewindable in case of a crash.
 *
 * To deal with that, jbd2_journal_get_undo_access requests write access to a
 * buffer for parts of non-rewindable operations such as delete
 * operations on the bitmaps.  The journaling code must keep a copy of
 * the buffer's contents prior to the undo_access call until such time
 * as we know that the buffer has definitely been committed to disk.
 *
 * We never need to know which transaction the committed data is part
 * of, buffers touched here are guaranteed to be dirtied later and so
 * will be committed to a new transaction in due course, at which point
 * we can discard the old committed data pointer.
 *
 * Returns error number or 0 on success.  -EROFS is returned when the
 * handle is aborted; any error from do_get_write_access() is passed
 * through unchanged.
 */
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
	int err;
	struct journal_head *jh;
	char *committed_data = NULL;

	if (is_handle_aborted(handle))
		return -EROFS;

	/*
	 * Fast path: access already granted and the committed-data copy
	 * already exists.
	 */
	if (jbd2_write_access_granted(handle, bh, true))
		return 0;

	jh = jbd2_journal_add_journal_head(bh);
	JBUFFER_TRACE(jh, "entry");

	/*
	 * Do this first --- it can drop the journal lock, so we want to
	 * make sure that obtaining the committed_data is done
	 * atomically wrt. completion of any outstanding commits.
	 */
	err = do_get_write_access(handle, jh, 1);
	if (err)
		goto out;

repeat:
	/*
	 * Allocate outside b_state_lock; whether the copy is still needed is
	 * re-checked below once the lock is held.
	 */
	if (!jh->b_committed_data)
		committed_data = jbd2_alloc(jh2bh(jh)->b_size,
					    GFP_NOFS|__GFP_NOFAIL);

	spin_lock(&jh->b_state_lock);
	if (!jh->b_committed_data) {
		/* Copy out the current buffer contents into the
		 * preserved, committed copy. */
		JBUFFER_TRACE(jh, "generate b_committed data");
		if (!committed_data) {
			/*
			 * b_committed_data was set at the unlocked check but
			 * is gone now: go back and allocate a buffer.
			 */
			spin_unlock(&jh->b_state_lock);
			goto repeat;
		}

		jh->b_committed_data = committed_data;
		committed_data = NULL;
		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
	}
	spin_unlock(&jh->b_state_lock);
out:
	jbd2_journal_put_journal_head(jh);
	/* Free an allocation that turned out not to be needed */
	if (unlikely(committed_data))
		jbd2_free(committed_data, bh->b_size);
	return err;
}
1451
1452/**
1453 * jbd2_journal_set_triggers() - Add triggers for commit writeout
1454 * @bh: buffer to trigger on
1455 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
1456 *
1457 * Set any triggers on this journal_head.  This is always safe, because
1458 * triggers for a committing buffer will be saved off, and triggers for
1459 * a running transaction will match the buffer in that transaction.
1460 *
1461 * Call with NULL to clear the triggers.
1462 */
1463void jbd2_journal_set_triggers(struct buffer_head *bh,
1464			       struct jbd2_buffer_trigger_type *type)
1465{
1466	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
1467
1468	if (WARN_ON_ONCE(!jh))
1469		return;
1470	jh->b_triggers = type;
1471	jbd2_journal_put_journal_head(jh);
1472}
1473
1474void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
1475				struct jbd2_buffer_trigger_type *triggers)
1476{
1477	struct buffer_head *bh = jh2bh(jh);
1478
1479	if (!triggers || !triggers->t_frozen)
1480		return;
1481
1482	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
1483}
1484
1485void jbd2_buffer_abort_trigger(struct journal_head *jh,
1486			       struct jbd2_buffer_trigger_type *triggers)
1487{
1488	if (!triggers || !triggers->t_abort)
1489		return;
1490
1491	triggers->t_abort(triggers, jh2bh(jh));
1492}
1493
/**
 * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
 *
 * mark dirty metadata which needs to be journaled as part of the current
 * transaction.
 *
 * The buffer must have previously had jbd2_journal_get_write_access()
 * called so that it has a valid journal_head attached to the buffer
 * head.
 *
 * The buffer is placed on the transaction's metadata list and is marked
 * as belonging to the transaction.
 *
 * Returns error number or 0 on success.  The errors produced here are
 * -EUCLEAN if @bh has no journal_head attached, -EROFS if the handle is
 * aborted, -ENOSPC if the handle has no buffer credits left, and -EINVAL
 * if the journal_head is found in an inconsistent state.
 *
 * Special care needs to be taken if the buffer already belongs to the
 * current committing transaction (in which case we should have frozen
 * data present for that commit).  In that case, we don't relink the
 * buffer: that only gets done when the old transaction finally
 * completes its commit.
 */
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;
	struct journal_head *jh;
	int ret = 0;

	if (!buffer_jbd(bh))
		return -EUCLEAN;

	/*
	 * We don't grab jh reference here since the buffer must be part
	 * of the running transaction.
	 */
	jh = bh2jh(bh);
	jbd2_debug(5, "journal_head %p\n", jh);
	JBUFFER_TRACE(jh, "entry");

	/*
	 * This and the following assertions are unreliable since we may see jh
	 * in inconsistent state unless we grab bh_state lock. But this is
	 * crucial to catch bugs so let's do a reliable check until the
	 * lockless handling is fully proven.
	 */
	if (data_race(jh->b_transaction != transaction &&
	    jh->b_next_transaction != transaction)) {
		/* The lockless test failed: repeat it under b_state_lock */
		spin_lock(&jh->b_state_lock);
		if (WARN_ON_ONCE(jh->b_transaction != transaction &&
				 jh->b_next_transaction != transaction)) {
			pr_err("JBD2: %s: assertion failure: b_transaction=%p transaction=%p b_next_transaction=%p\n",
			       journal->j_devname, jh->b_transaction,
			       transaction, jh->b_next_transaction);
			ret = -EINVAL;
			goto out_unlock_bh;
		}
		spin_unlock(&jh->b_state_lock);
	}
	/*
	 * b_modified is already set: only sanity-check the list the buffer is
	 * on and return without doing anything else.
	 */
	if (data_race(jh->b_modified == 1)) {
		/* If it's in our transaction it must be in BJ_Metadata list. */
		if (data_race(jh->b_transaction == transaction &&
		    jh->b_jlist != BJ_Metadata)) {
			spin_lock(&jh->b_state_lock);
			if (WARN_ON_ONCE(jh->b_transaction == transaction &&
					 jh->b_jlist != BJ_Metadata)) {
				pr_err("JBD2: assertion failure: h_type=%u h_line_no=%u block_no=%llu jlist=%u\n",
				       handle->h_type, handle->h_line_no,
				       (unsigned long long) bh->b_blocknr,
				       jh->b_jlist);
				ret = -EINVAL;
				goto out_unlock_bh;
			}
			spin_unlock(&jh->b_state_lock);
		}
		goto out;
	}

	spin_lock(&jh->b_state_lock);

	if (is_handle_aborted(handle)) {
		/*
		 * Check journal aborting with @jh->b_state_lock locked,
		 * since 'jh->b_transaction' could be replaced with
		 * 'jh->b_next_transaction' during old transaction
		 * committing if journal aborted, which may fail
		 * assertion on 'jh->b_frozen_data == NULL'.
		 */
		ret = -EROFS;
		goto out_unlock_bh;
	}

	if (jh->b_modified == 0) {
		/*
		 * This buffer's got modified and becoming part
		 * of the transaction. This needs to be done
		 * once a transaction -bzzz
		 */
		if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
			ret = -ENOSPC;
			goto out_unlock_bh;
		}
		jh->b_modified = 1;
		/* Charge one credit of the handle for this buffer */
		handle->h_total_credits--;
	}

	/*
	 * fastpath, to avoid expensive locking.  If this buffer is already
	 * on the running transaction's metadata list there is nothing to do.
	 * Nobody can take it off again because there is a handle open.
	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
	 * result in this test being false, so we go in and take the locks.
	 */
	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
		JBUFFER_TRACE(jh, "fastpath");
		if (unlikely(jh->b_transaction !=
			     journal->j_running_transaction)) {
			printk(KERN_ERR "JBD2: %s: "
			       "jh->b_transaction (%llu, %p, %u) != "
			       "journal->j_running_transaction (%p, %u)\n",
			       journal->j_devname,
			       (unsigned long long) bh->b_blocknr,
			       jh->b_transaction,
			       jh->b_transaction ? jh->b_transaction->t_tid : 0,
			       journal->j_running_transaction,
			       journal->j_running_transaction ?
			       journal->j_running_transaction->t_tid : 0);
			ret = -EINVAL;
		}
		goto out_unlock_bh;
	}

	set_buffer_jbddirty(bh);

	/*
	 * Metadata already on the current transaction list doesn't
	 * need to be filed.  Metadata on another transaction's list must
	 * be committing, and will be refiled once the commit completes:
	 * leave it alone for now.
	 */
	if (jh->b_transaction != transaction) {
		JBUFFER_TRACE(jh, "already on other transaction");
		if (unlikely(((jh->b_transaction !=
			       journal->j_committing_transaction)) ||
			     (jh->b_next_transaction != transaction))) {
			printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
			       "bad jh for block %llu: "
			       "transaction (%p, %u), "
			       "jh->b_transaction (%p, %u), "
			       "jh->b_next_transaction (%p, %u), jlist %u\n",
			       journal->j_devname,
			       (unsigned long long) bh->b_blocknr,
			       transaction, transaction->t_tid,
			       jh->b_transaction,
			       jh->b_transaction ?
			       jh->b_transaction->t_tid : 0,
			       jh->b_next_transaction,
			       jh->b_next_transaction ?
			       jh->b_next_transaction->t_tid : 0,
			       jh->b_jlist);
			WARN_ON(1);
			ret = -EINVAL;
		}
		/* And this case is illegal: we can't reuse another
		 * transaction's data buffer, ever. */
		goto out_unlock_bh;
	}

	/* That test should have eliminated the following case: */
	if (WARN_ON_ONCE(jh->b_frozen_data)) {
		ret = -EINVAL;
		goto out_unlock_bh;
	}

	JBUFFER_TRACE(jh, "file as BJ_Metadata");
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
	spin_unlock(&journal->j_list_lock);
out_unlock_bh:
	spin_unlock(&jh->b_state_lock);
out:
	JBUFFER_TRACE(jh, "exit");
	return ret;
}
1679
1680/**
1681 * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1682 * @handle: transaction handle
1683 * @bh:     bh to 'forget'
1684 *
1685 * We can only do the bforget if there are no commits pending against the
1686 * buffer.  If the buffer is dirty in the current running transaction we
1687 * can safely unlink it.
1688 *
1689 * bh may not be a journalled buffer at all - it may be a non-JBD
1690 * buffer which came off the hashtable.  Check for this.
1691 *
1692 * Decrements bh->b_count by one.
1693 *
1694 * Allow this call even if the handle has aborted --- it may be part of
1695 * the caller's cleanup after an abort.
1696 */
1697int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
1698{
1699	transaction_t *transaction = handle->h_transaction;
1700	journal_t *journal;
1701	struct journal_head *jh;
1702	int drop_reserve = 0;
1703	int err = 0;
1704	int was_modified = 0;
1705	int wait_for_writeback = 0;
1706	int abort_journal = 0;
1707
1708	if (is_handle_aborted(handle))
1709		return -EROFS;
1710	journal = transaction->t_journal;
1711
1712	BUFFER_TRACE(bh, "entry");
1713
1714	jh = jbd2_journal_grab_journal_head(bh);
1715	if (!jh) {
1716		__bforget(bh);
1717		return 0;
1718	}
1719
1720	spin_lock(&jh->b_state_lock);
1721
1722	/* Critical error: attempting to delete a bitmap buffer, maybe?
1723	 * Don't do any jbd operations, and return an error. */
1724	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1725			 "inconsistent data on disk")) {
1726		err = -EIO;
1727		goto drop;
1728	}
1729
1730	/* keep track of whether or not this transaction modified us */
1731	was_modified = jh->b_modified;
1732
1733	/*
1734	 * The buffer's going from the transaction, we must drop
1735	 * all references -bzzz
1736	 */
1737	jh->b_modified = 0;
1738
1739	if (jh->b_transaction == transaction) {
1740		if (WARN_ON_ONCE(jh->b_frozen_data)) {
1741			err = -EINVAL;
1742			abort_journal = 1;
1743			goto drop;
1744		}
1745
1746		/* If we are forgetting a buffer which is already part
1747		 * of this transaction, then we can just drop it from
1748		 * the transaction immediately. */
1749		clear_buffer_dirty(bh);
1750		clear_buffer_jbddirty(bh);
1751
1752		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1753
1754		/*
1755		 * we only want to drop a reference if this transaction
1756		 * modified the buffer
1757		 */
1758		if (was_modified)
1759			drop_reserve = 1;
1760
1761		/*
1762		 * We are no longer going to journal this buffer.
1763		 * However, the commit of this transaction is still
1764		 * important to the buffer: the delete that we are now
1765		 * processing might obsolete an old log entry, so by
1766		 * committing, we can satisfy the buffer's checkpoint.
1767		 *
1768		 * So, if we have a checkpoint on the buffer, we should
1769		 * now refile the buffer on our BJ_Forget list so that
1770		 * we know to remove the checkpoint after we commit.
1771		 */
1772
1773		spin_lock(&journal->j_list_lock);
1774		if (jh->b_cp_transaction) {
1775			__jbd2_journal_temp_unlink_buffer(jh);
1776			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1777		} else {
1778			__jbd2_journal_unfile_buffer(jh);
1779			jbd2_journal_put_journal_head(jh);
1780		}
1781		spin_unlock(&journal->j_list_lock);
1782	} else if (jh->b_transaction) {
1783		if (WARN_ON_ONCE(jh->b_transaction != journal->j_committing_transaction)) {
1784			err = -EINVAL;
1785			abort_journal = 1;
1786			goto drop;
1787		}
1788		/* However, if the buffer is still owned by a prior
1789		 * (committing) transaction, we can't drop it yet... */
1790		JBUFFER_TRACE(jh, "belongs to older transaction");
		/* ... but we CAN drop it from the new transaction by
		 * marking the buffer as freed and setting b_next_transaction
		 * to the new transaction, so that not only does the commit
		 * code know it should clear dirty bits when it is done with
		 * the buffer, but also the buffer can be checkpointed only
		 * after the new transaction commits. */
1797
1798		set_buffer_freed(bh);
1799
1800		if (!jh->b_next_transaction) {
1801			spin_lock(&journal->j_list_lock);
1802			jh->b_next_transaction = transaction;
1803			spin_unlock(&journal->j_list_lock);
1804		} else {
1805			if (WARN_ON_ONCE(jh->b_next_transaction != transaction)) {
1806				err = -EINVAL;
1807				abort_journal = 1;
1808				goto drop;
1809			}
1810
1811			/*
1812			 * only drop a reference if this transaction modified
1813			 * the buffer
1814			 */
1815			if (was_modified)
1816				drop_reserve = 1;
1817		}
1818	} else {
		/*
		 * Finally, if the buffer does not belong to any
		 * transaction, we can just drop it now if it has no
		 * checkpoint.
		 */
1824		spin_lock(&journal->j_list_lock);
1825		if (!jh->b_cp_transaction) {
1826			JBUFFER_TRACE(jh, "belongs to none transaction");
1827			spin_unlock(&journal->j_list_lock);
1828			goto drop;
1829		}
1830
1831		/*
1832		 * Otherwise, if the buffer has been written to disk,
1833		 * it is safe to remove the checkpoint and drop it.
1834		 */
1835		if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
1836			spin_unlock(&journal->j_list_lock);
1837			goto drop;
1838		}
1839
1840		/*
1841		 * The buffer has not yet been written to disk. We should
1842		 * either clear the buffer or ensure that the ongoing I/O
1843		 * is completed, and attach this buffer to current
1844		 * transaction so that the buffer can be checkpointed only
1845		 * after the current transaction commits.
1846		 */
1847		clear_buffer_dirty(bh);
1848		wait_for_writeback = 1;
1849		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1850		spin_unlock(&journal->j_list_lock);
1851	}
1852drop:
1853	__brelse(bh);
1854	spin_unlock(&jh->b_state_lock);
1855	if (abort_journal)
1856		jbd2_journal_abort(journal, err);
1857	if (wait_for_writeback)
1858		wait_on_buffer(bh);
1859	jbd2_journal_put_journal_head(jh);
1860	if (drop_reserve) {
1861		/* no need to reserve log space for this block -bzzz */
1862		handle->h_total_credits++;
1863	}
1864	return err;
1865}
1866
1867/**
1868 * jbd2_journal_stop() - complete a transaction
1869 * @handle: transaction to complete.
1870 *
1871 * All done for a particular handle.
1872 *
1873 * There is not much action needed here.  We just return any remaining
1874 * buffer credits to the transaction and remove the handle.  The only
1875 * complication is that we need to start a commit operation if the
1876 * filesystem is marked for synchronous update.
1877 *
1878 * jbd2_journal_stop itself will not usually return an error, but it may
1879 * do so in unusual circumstances.  In particular, expect it to
1880 * return -EIO if a jbd2_journal_abort has been executed since the
1881 * transaction began.
1882 */
1883int jbd2_journal_stop(handle_t *handle)
1884{
1885	transaction_t *transaction = handle->h_transaction;
1886	journal_t *journal;
1887	int err = 0, wait_for_commit = 0;
1888	tid_t tid;
1889	pid_t pid;
1890
1891	if (--handle->h_ref > 0) {
1892		jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1893						 handle->h_ref);
1894		if (is_handle_aborted(handle))
1895			return -EIO;
1896		return 0;
1897	}
1898	if (!transaction) {
1899		/*
1900		 * Handle is already detached from the transaction so there is
1901		 * nothing to do other than free the handle.
1902		 */
1903		memalloc_nofs_restore(handle->saved_alloc_context);
1904		goto free_and_exit;
1905	}
1906	journal = transaction->t_journal;
1907	tid = transaction->t_tid;
1908
1909	if (is_handle_aborted(handle))
1910		err = -EIO;
1911
1912	jbd2_debug(4, "Handle %p going down\n", handle);
1913	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1914				tid, handle->h_type, handle->h_line_no,
1915				jiffies - handle->h_start_jiffies,
1916				handle->h_sync, handle->h_requested_credits,
1917				(handle->h_requested_credits -
1918				 handle->h_total_credits));
1919
1920	/*
1921	 * Implement synchronous transaction batching.  If the handle
1922	 * was synchronous, don't force a commit immediately.  Let's
1923	 * yield and let another thread piggyback onto this
1924	 * transaction.  Keep doing that while new threads continue to
1925	 * arrive.  It doesn't cost much - we're about to run a commit
1926	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
1927	 * operations by 30x or more...
1928	 *
1929	 * We try and optimize the sleep time against what the
1930	 * underlying disk can do, instead of having a static sleep
1931	 * time.  This is useful for the case where our storage is so
1932	 * fast that it is more optimal to go ahead and force a flush
1933	 * and wait for the transaction to be committed than it is to
1934	 * wait for an arbitrary amount of time for new writers to
1935	 * join the transaction.  We achieve this by measuring how
1936	 * long it takes to commit a transaction, and compare it with
1937	 * how long this transaction has been running, and if run time
1938	 * < commit time then we sleep for the delta and commit.  This
1939	 * greatly helps super fast disks that would see slowdowns as
1940	 * more threads started doing fsyncs.
1941	 *
1942	 * But don't do this if this process was the most recent one
1943	 * to perform a synchronous write.  We do this to detect the
1944	 * case where a single process is doing a stream of sync
1945	 * writes.  No point in waiting for joiners in that case.
1946	 *
1947	 * Setting max_batch_time to 0 disables this completely.
1948	 */
1949	pid = current->pid;
1950	if (handle->h_sync && journal->j_last_sync_writer != pid &&
1951	    journal->j_max_batch_time) {
1952		u64 commit_time, trans_time;
1953
1954		journal->j_last_sync_writer = pid;
1955
1956		read_lock(&journal->j_state_lock);
1957		commit_time = journal->j_average_commit_time;
1958		read_unlock(&journal->j_state_lock);
1959
1960		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1961						   transaction->t_start_time));
1962
1963		commit_time = max_t(u64, commit_time,
1964				    1000*journal->j_min_batch_time);
1965		commit_time = min_t(u64, commit_time,
1966				    1000*journal->j_max_batch_time);
1967
1968		if (trans_time < commit_time) {
1969			ktime_t expires = ktime_add_ns(ktime_get(),
1970						       commit_time);
1971			set_current_state(TASK_UNINTERRUPTIBLE);
1972			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1973		}
1974	}
1975
1976	if (handle->h_sync)
1977		transaction->t_synchronous_commit = 1;
1978
1979	/*
1980	 * If the handle is marked SYNC, we need to set another commit
1981	 * going!  We also want to force a commit if the transaction is too
1982	 * old now.
1983	 */
1984	if (handle->h_sync ||
1985	    time_after_eq(jiffies, transaction->t_expires)) {
1986		/* Do this even for aborted journals: an abort still
1987		 * completes the commit thread, it just doesn't write
1988		 * anything to disk. */
1989
1990		jbd2_debug(2, "transaction too old, requesting commit for "
1991					"handle %p\n", handle);
1992		/* This is non-blocking */
1993		jbd2_log_start_commit(journal, tid);
1994
1995		/*
1996		 * Special case: JBD2_SYNC synchronous updates require us
1997		 * to wait for the commit to complete.
1998		 */
1999		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
2000			wait_for_commit = 1;
2001	}
2002
2003	/*
2004	 * Once stop_this_handle() drops t_updates, the transaction could start
2005	 * committing on us and eventually disappear.  So we must not
2006	 * dereference transaction pointer again after calling
2007	 * stop_this_handle().
2008	 */
2009	stop_this_handle(handle);
2010
2011	if (wait_for_commit)
2012		err = jbd2_log_wait_commit(journal, tid);
2013
2014free_and_exit:
2015	if (handle->h_rsv_handle)
2016		jbd2_free_handle(handle->h_rsv_handle);
2017	jbd2_free_handle(handle);
2018	return err;
2019}
2020
2021/*
2022 *
2023 * List management code snippets: various functions for manipulating the
2024 * transaction buffer lists.
2025 *
2026 */
2027
2028/*
2029 * Append a buffer to a transaction list, given the transaction's list head
2030 * pointer.
2031 *
2032 * j_list_lock is held.
2033 *
2034 * jh->b_state_lock is held.
2035 */
2036
2037static inline void
2038__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
2039{
2040	if (!*list) {
2041		jh->b_tnext = jh->b_tprev = jh;
2042		*list = jh;
2043	} else {
2044		/* Insert at the tail of the list to preserve order */
2045		struct journal_head *first = *list, *last = first->b_tprev;
2046		jh->b_tprev = last;
2047		jh->b_tnext = first;
2048		last->b_tnext = first->b_tprev = jh;
2049	}
2050}
2051
2052/*
2053 * Remove a buffer from a transaction list, given the transaction's list
2054 * head pointer.
2055 *
2056 * Called with j_list_lock held, and the journal may not be locked.
2057 *
2058 * jh->b_state_lock is held.
2059 */
2060
2061static inline void
2062__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
2063{
2064	if (*list == jh) {
2065		*list = jh->b_tnext;
2066		if (*list == jh)
2067			*list = NULL;
2068	}
2069	jh->b_tprev->b_tnext = jh->b_tnext;
2070	jh->b_tnext->b_tprev = jh->b_tprev;
2071}
2072
2073/*
2074 * Remove a buffer from the appropriate transaction list.
2075 *
2076 * Note that this function can *change* the value of
2077 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
2078 * t_reserved_list.  If the caller is holding onto a copy of one of these
2079 * pointers, it could go bad.  Generally the caller needs to re-read the
2080 * pointer from the transaction_t.
2081 *
2082 * Called under j_list_lock.
2083 */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
	struct journal_head **list = NULL;
	transaction_t *transaction;
	struct buffer_head *bh = jh2bh(jh);

	lockdep_assert_held(&jh->b_state_lock);
	transaction = jh->b_transaction;
	if (transaction)
		assert_spin_locked(&transaction->t_journal->j_list_lock);

	/* A buffer that sits on a list must be owned by a transaction. */
	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
	if (jh->b_jlist != BJ_None)
		J_ASSERT_JH(jh, transaction != NULL);

	/* Pick the transaction list head matching the buffer's current list. */
	switch (jh->b_jlist) {
	case BJ_None:
		/* Not on any list: nothing to unlink. */
		return;
	case BJ_Metadata:
		/* Only the metadata list keeps a buffer count. */
		transaction->t_nr_buffers--;
		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
		list = &transaction->t_buffers;
		break;
	case BJ_Forget:
		list = &transaction->t_forget;
		break;
	case BJ_Shadow:
		list = &transaction->t_shadow_list;
		break;
	case BJ_Reserved:
		list = &transaction->t_reserved_list;
		break;
	}

	/*
	 * Take the buffer off the list.  jh->b_transaction is deliberately
	 * left untouched by this function.
	 */
	__blist_del_buffer(list, jh);
	jh->b_jlist = BJ_None;
	/*
	 * On an aborted journal the jbddirty bit is simply dropped;
	 * otherwise a jbddirty buffer is turned into an ordinary dirty one.
	 */
	if (transaction && is_journal_aborted(transaction->t_journal))
		clear_buffer_jbddirty(bh);
	else if (test_clear_buffer_jbddirty(bh))
		mark_buffer_dirty(bh);	/* Expose it to the VM */
}
2125
/*
 * Remove buffer from all transactions. The caller is responsible for dropping
 * the jh reference that belonged to the transaction.
 *
 * The buffer must currently belong to a transaction and must not have a
 * next transaction queued; both conditions are asserted.
 *
 * Called with jh->b_state_lock and j_list_lock held.
 */
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
	J_ASSERT_JH(jh, jh->b_transaction != NULL);
	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);

	/* Take it off its list, then sever the link to the transaction. */
	__jbd2_journal_temp_unlink_buffer(jh);
	jh->b_transaction = NULL;
}
2140
2141/**
2142 * jbd2_journal_try_to_free_buffers() - try to free page buffers.
2143 * @journal: journal for operation
2144 * @folio: Folio to detach data from.
2145 *
2146 * For all the buffers on this page,
2147 * if they are fully written out ordered data, move them onto BUF_CLEAN
2148 * so try_to_free_buffers() can reap them.
2149 *
2150 * This function returns non-zero if we wish try_to_free_buffers()
2151 * to be called. We do this if the page is releasable by try_to_free_buffers().
2152 * We also do it if the page has locked or dirty buffers and the caller wants
2153 * us to perform sync or async writeout.
2154 *
2155 * This complicates JBD locking somewhat.  We aren't protected by the
2156 * BKL here.  We wish to remove the buffer from its committing or
2157 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
2158 *
2159 * This may *change* the value of transaction_t->t_datalist, so anyone
2160 * who looks at t_datalist needs to lock against this function.
2161 *
2162 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
2163 * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
2164 * will come out of the lock with the buffer dirty, which makes it
2165 * ineligible for release here.
2166 *
2167 * Who else is affected by this?  hmm...  Really the only contender
2168 * is do_get_write_access() - it could be looking at the buffer while
2169 * journal_try_to_free_buffer() is changing its state.  But that
2170 * cannot happen because we never reallocate freed data as metadata
2171 * while the data is part of a transaction.  Yes?
2172 *
2173 * Return false on failure, true on success
2174 */
2175bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
2176{
2177	struct buffer_head *head;
2178	struct buffer_head *bh;
2179	bool ret = false;
2180
2181	if (WARN_ON_ONCE(!folio_test_locked(folio)))
2182		return false;
2183
2184	head = folio_buffers(folio);
2185	bh = head;
2186	do {
2187		struct journal_head *jh;
2188
2189		/*
2190		 * We take our own ref against the journal_head here to avoid
2191		 * having to add tons of locking around each instance of
2192		 * jbd2_journal_put_journal_head().
2193		 */
2194		jh = jbd2_journal_grab_journal_head(bh);
2195		if (!jh)
2196			continue;
2197
2198		spin_lock(&jh->b_state_lock);
2199		if (!jh->b_transaction && !jh->b_next_transaction) {
2200			spin_lock(&journal->j_list_lock);
2201			/* Remove written-back checkpointed metadata buffer */
2202			if (jh->b_cp_transaction != NULL)
2203				jbd2_journal_try_remove_checkpoint(jh);
2204			spin_unlock(&journal->j_list_lock);
2205		}
2206		spin_unlock(&jh->b_state_lock);
2207		jbd2_journal_put_journal_head(jh);
2208		if (buffer_jbd(bh))
2209			goto busy;
2210	} while ((bh = bh->b_this_page) != head);
2211
2212	ret = try_to_free_buffers(folio);
2213busy:
2214	return ret;
2215}
2216
2217/*
2218 * This buffer is no longer needed.  If it is on an older transaction's
2219 * checkpoint list we need to record it on this transaction's forget list
2220 * to pin this buffer (and hence its checkpointing transaction) down until
2221 * this transaction commits.  If the buffer isn't on a checkpoint list, we
2222 * release it.
2223 * Returns non-zero if JBD no longer has an interest in the buffer.
2224 *
2225 * Called under j_list_lock.
2226 *
2227 * Called under jh->b_state_lock.
2228 */
2229static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
2230{
2231	int may_free = 1;
2232	struct buffer_head *bh = jh2bh(jh);
2233
2234	if (jh->b_cp_transaction) {
2235		JBUFFER_TRACE(jh, "on running+cp transaction");
2236		__jbd2_journal_temp_unlink_buffer(jh);
2237		/*
2238		 * We don't want to write the buffer anymore, clear the
2239		 * bit so that we don't confuse checks in
2240		 * __jbd2_journal_file_buffer
2241		 */
2242		clear_buffer_dirty(bh);
2243		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
2244		may_free = 0;
2245	} else {
2246		JBUFFER_TRACE(jh, "on running transaction");
2247		__jbd2_journal_unfile_buffer(jh);
2248		jbd2_journal_put_journal_head(jh);
2249	}
2250	return may_free;
2251}
2252
2253/*
2254 * jbd2_journal_invalidate_folio
2255 *
2256 * This code is tricky.  It has a number of cases to deal with.
2257 *
2258 * There are two invariants which this code relies on:
2259 *
2260 * i_size must be updated on disk before we start calling invalidate_folio
2261 * on the data.
2262 *
2263 *  This is done in ext3 by defining an ext3_setattr method which
2264 *  updates i_size before truncate gets going.  By maintaining this
2265 *  invariant, we can be sure that it is safe to throw away any buffers
2266 *  attached to the current transaction: once the transaction commits,
2267 *  we know that the data will not be needed.
2268 *
2269 *  Note however that we can *not* throw away data belonging to the
2270 *  previous, committing transaction!
2271 *
2272 * Any disk blocks which *are* part of the previous, committing
2273 * transaction (and which therefore cannot be discarded immediately) are
2274 * not going to be reused in the new running transaction
2275 *
2276 *  The bitmap committed_data images guarantee this: any block which is
2277 *  allocated in one transaction and removed in the next will be marked
2278 *  as in-use in the committed_data bitmap, so cannot be reused until
2279 *  the next transaction to delete the block commits.  This means that
2280 *  leaving committing buffers dirty is quite safe: the disk blocks
2281 *  cannot be reallocated to a different file and so buffer aliasing is
2282 *  not possible.
2283 *
2284 *
2285 * The above applies mainly to ordered data mode.  In writeback mode we
2286 * don't make guarantees about the order in which data hits disk --- in
2287 * particular we don't guarantee that new dirty data is flushed before
2288 * transaction commit --- so it is always safe just to discard data
2289 * immediately in that mode.  --sct
2290 */
2291
/*
 * The journal_unmap_buffer helper function returns zero if the buffer
 * concerned remains pinned as an anonymous buffer belonging to an older
 * transaction.
 *
 * Otherwise it returns the may_free value: 1 unless __dispose_buffer() had
 * to keep the buffer on a BJ_Forget list.  When @partial_page is set and the
 * buffer belongs to the committing transaction, it returns -EBUSY, or 0 if
 * the buffer was already zapped (bh->b_bdev is NULL).
 *
 * We're outside-transaction here.  Either or both of j_running_transaction
 * and j_committing_transaction may be NULL.
 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
				int partial_page)
{
	transaction_t *transaction;
	struct journal_head *jh;
	int may_free = 1;

	BUFFER_TRACE(bh, "entry");

	/*
	 * It is safe to proceed here without the j_list_lock because the
	 * buffers cannot be stolen by try_to_free_buffers as long as we are
	 * holding the page lock. --sct
	 */

	/* No journal_head attached: just clear the buffer state bits. */
	jh = jbd2_journal_grab_journal_head(bh);
	if (!jh)
		goto zap_buffer_unlocked;

	/* OK, we have data buffer in journaled mode */
	/* Lock order: j_state_lock, then b_state_lock, then j_list_lock. */
	write_lock(&journal->j_state_lock);
	spin_lock(&jh->b_state_lock);
	spin_lock(&journal->j_list_lock);

	/*
	 * We cannot remove the buffer from checkpoint lists until the
	 * transaction adding inode to orphan list (let's call it T)
	 * is committed.  Otherwise if the transaction changing the
	 * buffer would be cleaned from the journal before T is
	 * committed, a crash will cause that the correct contents of
	 * the buffer will be lost.  On the other hand we have to
	 * clear the buffer dirty bit at latest at the moment when the
	 * transaction marking the buffer as freed in the filesystem
	 * structures is committed because from that moment on the
	 * block can be reallocated and used by a different page.
	 * Since the block hasn't been freed yet but the inode has
	 * already been added to orphan list, it is safe for us to add
	 * the buffer to BJ_Forget list of the newest transaction.
	 *
	 * Also we have to clear buffer_mapped flag of a truncated buffer
	 * because the buffer_head may be attached to the page straddling
	 * i_size (can happen only when blocksize < pagesize) and thus the
	 * buffer_head can be reused when the file is extended again. So we end
	 * up keeping around invalidated buffers attached to transactions'
	 * BJ_Forget list just to stop checkpointing code from cleaning up
	 * the transaction this buffer was modified in.
	 */
	transaction = jh->b_transaction;
	if (transaction == NULL) {
		/* First case: not on any transaction.  If it
		 * has no checkpoint link, then we can zap it:
		 * it's a writeback-mode buffer so we don't care
		 * if it hits disk safely. */
		if (!jh->b_cp_transaction) {
			JBUFFER_TRACE(jh, "not on any transaction: zap");
			goto zap_buffer;
		}

		if (!buffer_dirty(bh)) {
			/* bdflush has written it.  We can drop it now */
			__jbd2_journal_remove_checkpoint(jh);
			goto zap_buffer;
		}

		/* OK, it must be in the journal but still not
		 * written fully to disk: it's metadata or
		 * journaled data... */

		if (journal->j_running_transaction) {
			/* ... and once the current transaction has
			 * committed, the buffer won't be needed any
			 * longer. */
			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
			may_free = __dispose_buffer(jh,
					journal->j_running_transaction);
			goto zap_buffer;
		} else {
			/* There is no currently-running transaction. So the
			 * orphan record which we wrote for this file must have
			 * passed into commit.  We must attach this buffer to
			 * the committing transaction, if it exists. */
			if (journal->j_committing_transaction) {
				JBUFFER_TRACE(jh, "give to committing trans");
				may_free = __dispose_buffer(jh,
					journal->j_committing_transaction);
				goto zap_buffer;
			} else {
				/* The orphan record's transaction has
				 * committed.  We can cleanse this buffer */
				clear_buffer_jbddirty(bh);
				__jbd2_journal_remove_checkpoint(jh);
				goto zap_buffer;
			}
		}
	} else if (transaction == journal->j_committing_transaction) {
		JBUFFER_TRACE(jh, "on committing transaction");
		/*
		 * The buffer is committing, we simply cannot touch
		 * it. If the page is straddling i_size we have to wait
		 * for commit and try again.
		 */
		if (partial_page) {
			spin_unlock(&journal->j_list_lock);
			spin_unlock(&jh->b_state_lock);
			write_unlock(&journal->j_state_lock);
			jbd2_journal_put_journal_head(jh);
			/* Already zapped buffer? Nothing to do... */
			if (!bh->b_bdev)
				return 0;
			return -EBUSY;
		}
		/*
		 * OK, buffer won't be reachable after truncate. We just clear
		 * b_modified to not confuse transaction credit accounting, and
		 * set b_next_transaction to the running transaction (if there
		 * is one) and mark buffer as freed so that commit code knows
		 * it should clear dirty bits when it is done with the buffer.
		 */
		set_buffer_freed(bh);
		if (journal->j_running_transaction && buffer_jbddirty(bh))
			jh->b_next_transaction = journal->j_running_transaction;
		jh->b_modified = 0;
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&jh->b_state_lock);
		write_unlock(&journal->j_state_lock);
		jbd2_journal_put_journal_head(jh);
		/* The buffer stays pinned by the committing transaction. */
		return 0;
	} else {
		/* Good, the buffer belongs to the running transaction.
		 * We are writing our own transaction's data, not any
		 * previous one's, so it is safe to throw it away
		 * (remember that we expect the filesystem to have set
		 * i_size already for this truncate so recovery will not
		 * expose the disk blocks we are discarding here.) */
		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
		JBUFFER_TRACE(jh, "on running transaction");
		may_free = __dispose_buffer(jh, transaction);
	}

zap_buffer:
	/*
	 * This is tricky. Although the buffer is truncated, it may be reused
	 * if blocksize < pagesize and it is attached to the page straddling
	 * EOF. Since the buffer might have been added to BJ_Forget list of the
	 * running transaction, journal_get_write_access() won't clear
	 * b_modified and credit accounting gets confused. So clear b_modified
	 * here.
	 */
	jh->b_modified = 0;
	spin_unlock(&journal->j_list_lock);
	spin_unlock(&jh->b_state_lock);
	write_unlock(&journal->j_state_lock);
	jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
	/* Wipe the buffer state so it reads as unmapped and clean. */
	clear_buffer_dirty(bh);
	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
	bh->b_bdev = NULL;
	return may_free;
}
2464
2465/**
2466 * jbd2_journal_invalidate_folio()
2467 * @journal: journal to use for flush...
2468 * @folio:    folio to flush
2469 * @offset:  start of the range to invalidate
2470 * @length:  length of the range to invalidate
2471 *
 * Reap page buffers containing data in the specified range of the folio.
2473 * Can return -EBUSY if buffers are part of the committing transaction and
2474 * the page is straddling i_size. Caller then has to wait for current commit
2475 * and try again.
2476 */
2477int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
2478				size_t offset, size_t length)
2479{
2480	struct buffer_head *head, *bh, *next;
2481	unsigned int stop = offset + length;
2482	unsigned int curr_off = 0;
2483	int partial_page = (offset || length < folio_size(folio));
2484	int may_free = 1;
2485	int ret = 0;
2486
2487	if (!folio_test_locked(folio))
2488		BUG();
2489	head = folio_buffers(folio);
2490	if (!head)
2491		return 0;
2492
2493	BUG_ON(stop > folio_size(folio) || stop < length);
2494
2495	/* We will potentially be playing with lists other than just the
2496	 * data lists (especially for journaled data mode), so be
2497	 * cautious in our locking. */
2498
2499	bh = head;
2500	do {
2501		unsigned int next_off = curr_off + bh->b_size;
2502		next = bh->b_this_page;
2503
2504		if (next_off > stop)
2505			return 0;
2506
2507		if (offset <= curr_off) {
2508			/* This block is wholly outside the truncation point */
2509			lock_buffer(bh);
2510			ret = journal_unmap_buffer(journal, bh, partial_page);
2511			unlock_buffer(bh);
2512			if (ret < 0)
2513				return ret;
2514			may_free &= ret;
2515		}
2516		curr_off = next_off;
2517		bh = next;
2518
2519	} while (bh != head);
2520
2521	if (!partial_page) {
2522		if (may_free && try_to_free_buffers(folio))
2523			J_ASSERT(!folio_buffers(folio));
2524	}
2525	return 0;
2526}
2527
/*
 * File a buffer on the given transaction list.
 *
 * @jh must either be unowned or already belong to @transaction (asserted).
 * If it is already on list @jlist of its transaction, nothing is done.
 * Any dirty or jbddirty state found on the buffer is carried over as
 * jbddirty once the buffer sits on its new list.
 *
 * Called with jh->b_state_lock and the journal's j_list_lock held.
 */
void __jbd2_journal_file_buffer(struct journal_head *jh,
			transaction_t *transaction, int jlist)
{
	struct journal_head **list = NULL;
	int was_dirty = 0;
	struct buffer_head *bh = jh2bh(jh);

	lockdep_assert_held(&jh->b_state_lock);
	assert_spin_locked(&transaction->t_journal->j_list_lock);

	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
				jh->b_transaction == NULL);

	/* Already where the caller wants it. */
	if (jh->b_transaction && jh->b_jlist == jlist)
		return;

	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
	    jlist == BJ_Shadow || jlist == BJ_Forget) {
		/*
		 * For metadata buffers, we track dirty bit in buffer_jbddirty
		 * instead of buffer_dirty. We should not see a dirty bit set
		 * here because we clear it in do_get_write_access but e.g.
		 * tune2fs can modify the sb and set the dirty bit at any time
		 * so we try to gracefully handle that.
		 */
		if (buffer_dirty(bh))
			warn_dirty_buffer(bh);
		if (test_clear_buffer_dirty(bh) ||
		    test_clear_buffer_jbddirty(bh))
			was_dirty = 1;
	}

	/*
	 * A buffer already owned by the transaction is taken off its current
	 * list; an unowned one gets a journal_head reference grabbed here.
	 */
	if (jh->b_transaction)
		__jbd2_journal_temp_unlink_buffer(jh);
	else
		jbd2_journal_grab_journal_head(bh);
	jh->b_transaction = transaction;

	switch (jlist) {
	case BJ_None:
		/* Owned by the transaction but not put on any list. */
		J_ASSERT_JH(jh, !jh->b_committed_data);
		J_ASSERT_JH(jh, !jh->b_frozen_data);
		return;
	case BJ_Metadata:
		/* Only the metadata list keeps a buffer count. */
		transaction->t_nr_buffers++;
		list = &transaction->t_buffers;
		break;
	case BJ_Forget:
		list = &transaction->t_forget;
		break;
	case BJ_Shadow:
		list = &transaction->t_shadow_list;
		break;
	case BJ_Reserved:
		list = &transaction->t_reserved_list;
		break;
	}

	__blist_add_buffer(list, jh);
	jh->b_jlist = jlist;

	/* Restore the dirty state cleared above, now tracked as jbddirty. */
	if (was_dirty)
		set_buffer_jbddirty(bh);
}
2596
2597void jbd2_journal_file_buffer(struct journal_head *jh,
2598				transaction_t *transaction, int jlist)
2599{
2600	spin_lock(&jh->b_state_lock);
2601	spin_lock(&transaction->t_journal->j_list_lock);
2602	__jbd2_journal_file_buffer(jh, transaction, jlist);
2603	spin_unlock(&transaction->t_journal->j_list_lock);
2604	spin_unlock(&jh->b_state_lock);
2605}
2606
2607/*
2608 * Remove a buffer from its current buffer list in preparation for
2609 * dropping it from its current transaction entirely.  If the buffer has
2610 * already started to be used by a subsequent transaction, refile the
2611 * buffer on that transaction's metadata list.
2612 *
2613 * Called under j_list_lock
2614 * Called under jh->b_state_lock
2615 *
2616 * When this function returns true, there's no next transaction to refile to
2617 * and the caller has to drop jh reference through
2618 * jbd2_journal_put_journal_head().
2619 */
bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
	int was_dirty, jlist;
	struct buffer_head *bh = jh2bh(jh);

	lockdep_assert_held(&jh->b_state_lock);
	if (jh->b_transaction)
		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);

	/* If the buffer is now unused, just drop it. */
	if (jh->b_next_transaction == NULL) {
		__jbd2_journal_unfile_buffer(jh);
		return true;
	}

	/*
	 * It has been modified by a later transaction: add it to the new
	 * transaction's metadata list.
	 */

	/*
	 * Remember and clear jbddirty before unlinking so that the unlink
	 * does not act on it; it is set again once the buffer is refiled.
	 */
	was_dirty = test_clear_buffer_jbddirty(bh);
	__jbd2_journal_temp_unlink_buffer(jh);

	/*
	 * b_transaction must be set, otherwise the new b_transaction won't
	 * be holding jh reference
	 */
	J_ASSERT_JH(jh, jh->b_transaction != NULL);

	/*
	 * We set b_transaction here because b_next_transaction will inherit
	 * our jh reference and thus __jbd2_journal_file_buffer() must not
	 * take a new one.
	 */
	WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
	WRITE_ONCE(jh->b_next_transaction, NULL);
	/*
	 * Pick the destination list: freed buffers go to BJ_Forget, buffers
	 * with b_modified set to BJ_Metadata, everything else to BJ_Reserved.
	 */
	if (buffer_freed(bh))
		jlist = BJ_Forget;
	else if (jh->b_modified)
		jlist = BJ_Metadata;
	else
		jlist = BJ_Reserved;
	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);

	if (was_dirty)
		set_buffer_jbddirty(bh);
	return false;
}
2669
2670/*
2671 * __jbd2_journal_refile_buffer() with necessary locking added. We take our
2672 * bh reference so that we can safely unlock bh.
2673 *
2674 * The jh and bh may be freed by this call.
2675 */
2676void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2677{
2678	bool drop;
2679
2680	spin_lock(&jh->b_state_lock);
2681	spin_lock(&journal->j_list_lock);
2682	drop = __jbd2_journal_refile_buffer(jh);
2683	spin_unlock(&jh->b_state_lock);
2684	spin_unlock(&journal->j_list_lock);
2685	if (drop)
2686		jbd2_journal_put_journal_head(jh);
2687}
2688
2689/*
2690 * File inode in the inode list of the handle's transaction
2691 */
2692static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
2693		unsigned long flags, loff_t start_byte, loff_t end_byte)
2694{
2695	transaction_t *transaction = handle->h_transaction;
2696	journal_t *journal;
2697	pgoff_t start_page, end_page;
2698	int err = 0;
2699	int abort_transaction = 0;
2700
2701	if (is_handle_aborted(handle))
2702		return -EROFS;
2703	journal = transaction->t_journal;
2704
2705	jbd2_debug(4, "Adding inode %llu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2706			transaction->t_tid);
2707
2708	start_page = (pgoff_t)(start_byte >> PAGE_SHIFT);
2709	end_page = (pgoff_t)(end_byte >> PAGE_SHIFT) + 1;
2710
2711	spin_lock(&journal->j_list_lock);
2712	WRITE_ONCE(jinode->i_flags, jinode->i_flags | flags);
2713
2714	if (jinode->i_dirty_start_page != jinode->i_dirty_end_page) {
2715		WRITE_ONCE(jinode->i_dirty_start_page,
2716			   min(jinode->i_dirty_start_page, start_page));
2717		WRITE_ONCE(jinode->i_dirty_end_page,
2718			   max(jinode->i_dirty_end_page, end_page));
2719	} else {
2720		/* Publish a new non-empty range by making end visible first. */
2721		WRITE_ONCE(jinode->i_dirty_end_page, end_page);
2722		WRITE_ONCE(jinode->i_dirty_start_page, start_page);
2723	}
2724
2725	/* Is inode already attached where we need it? */
2726	if (jinode->i_transaction == transaction ||
2727	    jinode->i_next_transaction == transaction)
2728		goto done;
2729
2730	/*
2731	 * We only ever set this variable to 1 so the test is safe. Since
2732	 * t_need_data_flush is likely to be set, we do the test to save some
2733	 * cacheline bouncing
2734	 */
2735	if (!transaction->t_need_data_flush)
2736		transaction->t_need_data_flush = 1;
2737	/* On some different transaction's list - should be
2738	 * the committing one */
2739	if (jinode->i_transaction) {
2740		if (WARN_ON_ONCE(jinode->i_next_transaction ||
2741				 jinode->i_transaction !=
2742				 journal->j_committing_transaction)) {
2743			pr_err("JBD2: %s: assertion failure: i_next_transaction=%p i_transaction=%p j_committing_transaction=%p\n",
2744			       journal->j_devname, jinode->i_next_transaction,
2745			       jinode->i_transaction,
2746			       journal->j_committing_transaction);
2747			err = -EINVAL;
2748			abort_transaction = 1;
2749			goto done;
2750		}
2751		jinode->i_next_transaction = transaction;
2752		goto done;
2753	}
2754	/* Not on any transaction list... */
2755	if (WARN_ON_ONCE(jinode->i_next_transaction)) {
2756		err = -EINVAL;
2757		abort_transaction = 1;
2758		goto done;
2759	}
2760	jinode->i_transaction = transaction;
2761	list_add(&jinode->i_list, &transaction->t_inode_list);
2762done:
2763	spin_unlock(&journal->j_list_lock);
2764	if (abort_transaction)
2765		jbd2_journal_abort(journal, err);
2766	return err;
2767}
2768
2769int jbd2_journal_inode_ranged_write(handle_t *handle,
2770		struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
2771{
2772	return jbd2_journal_file_inode(handle, jinode,
2773			JI_WRITE_DATA | JI_WAIT_DATA, start_byte,
2774			start_byte + length - 1);
2775}
2776
2777int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
2778		loff_t start_byte, loff_t length)
2779{
2780	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
2781			start_byte, start_byte + length - 1);
2782}
2783
2784/*
2785 * File truncate and transaction commit interact with each other in a
2786 * non-trivial way.  If a transaction writing data block A is
2787 * committing, we cannot discard the data by truncate until we have
2788 * written them.  Otherwise if we crashed after the transaction with
2789 * write has committed but before the transaction with truncate has
2790 * committed, we could see stale data in block A.  This function is a
2791 * helper to solve this problem.  It starts writeout of the truncated
2792 * part in case it is in the committing transaction.
2793 *
2794 * Filesystem code must call this function when inode is journaled in
2795 * ordered mode before truncation happens and after the inode has been
2796 * placed on orphan list with the new inode size. The second condition
2797 * avoids the race that someone writes new data and we start
2798 * committing the transaction after this function has been called but
2799 * before a transaction for truncate is started (and furthermore it
2800 * allows us to optimize the case where the addition to orphan list
2801 * happens in the same transaction as write --- we don't have to write
2802 * any data in such case).
2803 */
2804int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2805					struct jbd2_inode *jinode,
2806					loff_t new_size)
2807{
2808	transaction_t *inode_trans, *commit_trans;
2809	int ret = 0;
2810
2811	/* This is a quick check to avoid locking if not necessary */
2812	if (!READ_ONCE(jinode->i_transaction))
2813		goto out;
2814	/* Locks are here just to force reading of recent values, it is
2815	 * enough that the transaction was not committing before we started
2816	 * a transaction adding the inode to orphan list */
2817	read_lock(&journal->j_state_lock);
2818	commit_trans = journal->j_committing_transaction;
2819	read_unlock(&journal->j_state_lock);
2820	spin_lock(&journal->j_list_lock);
2821	inode_trans = jinode->i_transaction;
2822	spin_unlock(&journal->j_list_lock);
2823	if (inode_trans == commit_trans) {
2824		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
2825			new_size, LLONG_MAX);
2826		if (ret)
2827			jbd2_journal_abort(journal, ret);
2828	}
2829out:
2830	return ret;
2831}
Configure Feed

Configure Feed