// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
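	/*
	 * For a metadata write, this temporary bh shadows the original
	 * metadata buffer.  Once the journal IO completes, drop BH_Shadow
	 * on the original so that anyone waiting for the shadow state to
	 * clear (e.g. a writer in do_get_write_access()) can continue.
	 */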
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers. These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list. Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under j_list_lock. The caller provided us with a ref against the
 * buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct folio *folio;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	folio = bh->b_folio;
	if (folio->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!folio_trylock(folio))
		goto nope;

	folio_get(folio);
	__brelse(bh);
	try_to_free_buffers(folio);
	folio_unlock(folio);
	folio_put(folio);
	return;

nope:
	__brelse(bh);
}

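/*
 * Compute the checksum of a commit block and store it in the commit header.
 * The checksum fields are zeroed first so that the block is checksummed in
 * exactly the layout that will be written to disk.
 */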
static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record. We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	struct timespec64 now;
	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

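	/*
	 * With barriers enabled and no async commit, the commit block must
	 * not reach stable storage before the blocks it commits, so issue
	 * it with a cache flush (REQ_PREFLUSH) and forced unit access
	 * (REQ_FUA).
	 */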
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		write_flags |= REQ_PREFLUSH | REQ_FUA;

	submit_bh(write_flags, bh);
	*cbh = bh;
	return 0;
}

/*
 * This function, together with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);	/* One for getblk() */

	return ret;
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	unsigned long flags;

	if (!jinode)
		return 0;

	flags = READ_ONCE(jinode->i_flags);
	if (!(flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return journal->j_submit_inode_data_buffers(jinode);

}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	struct address_space *mapping;
	struct inode *inode;
	unsigned long flags;
	loff_t start_byte, end_byte;

	if (!jinode)
		return 0;

	flags = READ_ONCE(jinode->i_flags);
	if (!(flags & JI_WAIT_DATA))
		return 0;

	inode = jinode->i_vfs_inode;
	if (!inode)
		return 0;

	mapping = inode->i_mapping;
	if (!mapping)
		return 0;

	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
		return 0;
	return filemap_fdatawait_range_keep_errors(
		mapping, start_byte, end_byte);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		WRITE_ONCE(jinode->i_flags,
			   jinode->i_flags | JI_COMMIT_RUNNING);
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		WRITE_ONCE(jinode->i_flags,
			   jinode->i_flags & ~JI_COMMIT_RUNNING);
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

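/*
 * Wait for writeback of the jinode's dirty byte range to complete.  Errors
 * are deliberately left pending on the mapping (rather than cleared) so the
 * filesystem can still see and report them later.
 */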
int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	loff_t start_byte, end_byte;

	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
		return 0;

	return filemap_fdatawait_range_keep_errors(mapping,
						   start_byte, end_byte);
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING);
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		cond_resched();
		spin_lock(&journal->j_list_lock);
		WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING);
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				 &jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			WRITE_ONCE(jinode->i_dirty_start_page, 0);
			WRITE_ONCE(jinode->i_dirty_end_page, 0);
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

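/*
 * Fold one buffer's contents into the running crc32 that backs the
 * JBD2_FEATURE_COMPAT_CHECKSUM (v1) commit-block checksum.
 */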
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	char *addr;
	__u32 checksum;

	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	checksum = crc32_be(crc32_sum, addr, bh->b_size);
	kunmap_local(addr);

	return checksum;
}

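/*
 * Record the on-disk location of a journalled buffer in its block tag.
 * The low 32 bits always go in t_blocknr; with the 64bit feature the high
 * bits go in t_blocknr_high.  The shift is written as "(block >> 31) >> 1"
 * rather than ">> 32", presumably to keep the expression well-defined even
 * if the block number were ever held in a 32-bit type.
 */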
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

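/*
 * Per-block checksum for the v2/v3 checksum features: seeded with the
 * journal's checksum seed, it covers the commit sequence number followed by
 * the buffer contents.  csum3 stores the full 32-bit value, while the older
 * csum2 tag only has room for the low 16 bits.
 */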
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(csum32, addr, bh->b_size);
	kunmap_local(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log. This
 * function is called by the journal thread to begin a complete commit.
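 *
 * In outline: lock down the running transaction and wait for its updates to
 * finish, flush the associated data buffers, write revoke records and the
 * metadata blocks (preceded by descriptor blocks recording where each one
 * lives in the log), wait for all of that IO, write the commit record, and
 * finally move the transaction onto the checkpoint list so its log space
 * can be reclaimed later.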
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int escape;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd2_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail, 0);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd2_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new fast
		 * commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit can proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
		   commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	// waits for any t_updates to finish
	jbd2_journal_wait_updates(journal);

	commit_transaction->t_state = T_SWITCH;

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding. These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved. This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists. We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
	spin_unlock(&journal->j_list_lock);

	jbd2_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction, which is about to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	write_lock(&journal->j_state_lock);
	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up_all(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd2_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists. Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd2_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction! Now comes the tricky part: we need to write out
	 * metadata. Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd2_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		escape = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

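		/*
		 * "escape" is set when the buffer's first four bytes happen
		 * to match the journal magic number; the copy written to the
		 * log has had them zeroed, and the JBD2_FLAG_ESCAPE tag flag
		 * tells recovery to restore them on replay.
		 */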
		tag_flag = 0;
		if (escape)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							       descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];

				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
					  bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors %d while flushing file data on %s\n",
		       err, journal->j_devname);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_max_transaction_buffers)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record and update the journal tail sequence.
	 */
	if ((commit_transaction->t_need_data_flush || update_tail) &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd2_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd2_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);	/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	WARN_ON_ONCE(
		atomic_read(&commit_transaction->t_outstanding_credits) < 0);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd2_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;
		bool drop_ref;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		spin_lock(&jh->b_state_lock);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now. If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction is refiled to the BJ_Forget list
		 * of the running transaction. If the just-committed
		 * transaction contains an "add to orphan" operation, we can
		 * completely invalidate the buffer now. We are rather
		 * thorough in that, since the buffer may still be accessible
		 * when blocksize < pagesize and it is attached to the last
		 * partial page.
		 */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			struct address_space *mapping;

			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);

			/*
			 * Block device buffers need to stay mapped all the
			 * time, so it is enough to clear buffer_jbddirty and
			 * buffer_freed bits. For the file mapping buffers (i.e.
			 * journalled data) we need to unmap buffer and clear
			 * more bits. We also need to be careful about the check
			 * because the data page mapping can get cleared under
			 * our hands. Note that if mapping == NULL, we don't
			 * need to make buffer unmapped because the page is
			 * already detached from the mapping and buffers cannot
			 * get reused.
			 */
			mapping = READ_ONCE(bh->b_folio->mapping);
			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		drop_ref = __jbd2_journal_refile_buffer(jh);
		spin_unlock(&jh->b_state_lock);
		if (drop_ref)
			jbd2_journal_put_journal_head(jh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
			commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd2_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
		   journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
	wake_up(&journal->j_fc_wait);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}