Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'bcachefs-2025-06-26' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:

- Lots of small check/repair fixes, primarily in subvol loop and
directory structure loop (when involving snapshots).

- Fix a few 6.16 regressions: a rare UAF in the foreground allocator path
when taking a transaction restart from the transaction bump
allocator; some small fallout from the change to log the error
being corrected in the journal when repairing errors; and some
fallout from the btree node read error logging improvements.

(Alan, Bharadwaj)

- New option: journal_rewind

This lets the entire filesystem be reset to an earlier point in time.

Note that this is only a disaster recovery tool, and right now there
are major caveats to using it (discards should be disabled, in
particular), but it successfully restored the filesystem of one of
the users who was bitten by the subvolume deletion bug and didn't have
backups. I'll likely be making some changes to the discard path in
the future to make this a reliable recovery tool.

- Some new btree iterator tracepoints, for tracking down some
livelock-ish behaviour we've been seeing in the main data write path.

* tag 'bcachefs-2025-06-26' of git://evilpiepirate.org/bcachefs: (51 commits)
bcachefs: Plumb correct ip to trans_relock_fail tracepoint
bcachefs: Ensure we rewind to run recovery passes
bcachefs: Ensure btree node scan runs before checking for scanned nodes
bcachefs: btree_root_unreadable_and_scan_found_nothing should not be autofix
bcachefs: fix bch2_journal_keys_peek_prev_min() underflow
bcachefs: Use wait_on_allocator() when allocating journal
bcachefs: Check for bad write buffer key when moving from journal
bcachefs: Don't unlock the trans if ret doesn't match BCH_ERR_operation_blocked
bcachefs: Fix range in bch2_lookup_indirect_extent() error path
bcachefs: fix spurious error_throw
bcachefs: Add missing bch2_err_class() to fileattr_set()
bcachefs: Add missing key type checks to check_snapshot_exists()
bcachefs: Don't log fsck err in the journal if doing repair elsewhere
bcachefs: Fix *__bch2_trans_subbuf_alloc() error path
bcachefs: Fix missing newlines before ero
bcachefs: fix spurious error in read_btree_roots()
bcachefs: fsck: Fix oops in key_visible_in_snapshot()
bcachefs: fsck: fix unhandled restart in topology repair
bcachefs: fsck: Fix check_directory_structure when no check_dirents
bcachefs: Fix restart handling in btree_node_scrub_work()
...

+733 -450
+9 -4
fs/bcachefs/alloc_background.c
··· 1406 1406 : BCH_DATA_free; 1407 1407 struct printbuf buf = PRINTBUF; 1408 1408 1409 + unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| 1410 + FSCK_CAN_FIX|FSCK_CAN_IGNORE; 1411 + 1409 1412 struct bpos bucket = iter->pos; 1410 1413 bucket.offset &= ~(~0ULL << 56); 1411 1414 u64 genbits = iter->pos.offset & (~0ULL << 56); ··· 1422 1419 return ret; 1423 1420 1424 1421 if (!bch2_dev_bucket_exists(c, bucket)) { 1425 - if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, 1426 - "entry in %s btree for nonexistant dev:bucket %llu:%llu", 1427 - bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) 1422 + if (__fsck_err(trans, fsck_flags, 1423 + need_discard_freespace_key_to_invalid_dev_bucket, 1424 + "entry in %s btree for nonexistant dev:bucket %llu:%llu", 1425 + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) 1428 1426 goto delete; 1429 1427 ret = 1; 1430 1428 goto out; ··· 1437 1433 if (a->data_type != state || 1438 1434 (state == BCH_DATA_free && 1439 1435 genbits != alloc_freespace_genbits(*a))) { 1440 - if (fsck_err(trans, need_discard_freespace_key_bad, 1436 + if (__fsck_err(trans, fsck_flags, 1437 + need_discard_freespace_key_bad, 1441 1438 "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", 1442 1439 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), 1443 1440 bch2_btree_id_str(iter->btree_id),
+1 -1
fs/bcachefs/backpointers.c
··· 353 353 return ret ? bkey_s_c_err(ret) : bkey_s_c_null; 354 354 } else { 355 355 struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); 356 - if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node))) 356 + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) 357 357 return bkey_s_c_null; 358 358 if (IS_ERR_OR_NULL(b)) 359 359 return ((struct bkey_s_c) { .k = ERR_CAST(b) });
+2 -1
fs/bcachefs/bcachefs.h
··· 767 767 x(sysfs) \ 768 768 x(btree_write_buffer) \ 769 769 x(btree_node_scrub) \ 770 - x(async_recovery_passes) 770 + x(async_recovery_passes) \ 771 + x(ioctl_data) 771 772 772 773 enum bch_write_ref { 773 774 #define x(n) BCH_WRITE_REF_##n,
+25 -12
fs/bcachefs/btree_gc.c
··· 503 503 prt_newline(&buf); 504 504 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 505 505 506 + /* 507 + * XXX: we're not passing the trans object here because we're not set up 508 + * to handle a transaction restart - this code needs to be rewritten 509 + * when we start doing online topology repair 510 + */ 511 + bch2_trans_unlock_long(trans); 506 512 if (mustfix_fsck_err_on(!have_child, 507 - trans, btree_node_topology_interior_node_empty, 513 + c, btree_node_topology_interior_node_empty, 508 514 "empty interior btree node at %s", buf.buf)) 509 515 ret = DROP_THIS_NODE; 510 516 err: ··· 534 528 return ret; 535 529 } 536 530 537 - static int bch2_check_root(struct btree_trans *trans, enum btree_id i, 531 + static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, 538 532 bool *reconstructed_root) 539 533 { 540 534 struct bch_fs *c = trans->c; 541 - struct btree_root *r = bch2_btree_id_root(c, i); 535 + struct btree_root *r = bch2_btree_id_root(c, btree); 542 536 struct printbuf buf = PRINTBUF; 543 537 int ret = 0; 544 538 545 - bch2_btree_id_to_text(&buf, i); 539 + bch2_btree_id_to_text(&buf, btree); 546 540 547 541 if (r->error) { 548 542 bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); 549 543 550 - r->alive = false; 551 - r->error = 0; 544 + ret = bch2_btree_has_scanned_nodes(c, btree); 545 + if (ret < 0) 546 + goto err; 552 547 553 - if (!bch2_btree_has_scanned_nodes(c, i)) { 548 + if (!ret) { 554 549 __fsck_err(trans, 555 - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), 550 + FSCK_CAN_FIX|(!btree_id_important(btree) ? 
FSCK_AUTOFIX : 0), 556 551 btree_root_unreadable_and_scan_found_nothing, 557 552 "no nodes found for btree %s, continue?", buf.buf); 558 - bch2_btree_root_alloc_fake_trans(trans, i, 0); 553 + 554 + r->alive = false; 555 + r->error = 0; 556 + bch2_btree_root_alloc_fake_trans(trans, btree, 0); 559 557 } else { 560 - bch2_btree_root_alloc_fake_trans(trans, i, 1); 561 - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 562 - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); 558 + r->alive = false; 559 + r->error = 0; 560 + bch2_btree_root_alloc_fake_trans(trans, btree, 1); 561 + 562 + bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); 563 + ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); 563 564 if (ret) 564 565 goto err; 565 566 }
+31 -43
fs/bcachefs/btree_io.c
··· 557 557 const char *fmt, ...) 558 558 { 559 559 if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) 560 - return bch_err_throw(c, fsck_fix); 560 + return ret == -BCH_ERR_btree_node_read_err_fixable 561 + ? bch_err_throw(c, fsck_fix) 562 + : ret; 561 563 562 564 bool have_retry = false; 563 565 int ret2; ··· 725 723 726 724 static int validate_bset(struct bch_fs *c, struct bch_dev *ca, 727 725 struct btree *b, struct bset *i, 728 - unsigned offset, unsigned sectors, int write, 726 + unsigned offset, int write, 729 727 struct bch_io_failures *failed, 730 728 struct printbuf *err_msg) 731 729 { 732 730 unsigned version = le16_to_cpu(i->version); 733 - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); 734 731 struct printbuf buf1 = PRINTBUF; 735 732 struct printbuf buf2 = PRINTBUF; 736 733 int ret = 0; ··· 778 777 c, ca, b, i, NULL, 779 778 btree_node_unsupported_version, 780 779 "BSET_SEPARATE_WHITEOUTS no longer supported"); 781 - 782 - if (!write && 783 - btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), 784 - -BCH_ERR_btree_node_read_err_fixable, 785 - c, ca, b, i, NULL, 786 - bset_past_end_of_btree_node, 787 - "bset past end of btree node (offset %u len %u but written %zu)", 788 - offset, sectors, ptr_written ?: btree_sectors(c))) 789 - i->u64s = 0; 790 780 791 781 btree_err_on(offset && !i->u64s, 792 782 -BCH_ERR_btree_node_read_err_fixable, ··· 1143 1151 "unknown checksum type %llu", BSET_CSUM_TYPE(i)); 1144 1152 1145 1153 if (first) { 1154 + sectors = vstruct_sectors(b->data, c->block_bits); 1155 + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), 1156 + -BCH_ERR_btree_node_read_err_fixable, 1157 + c, ca, b, i, NULL, 1158 + bset_past_end_of_btree_node, 1159 + "bset past end of btree node (offset %u len %u but written %zu)", 1160 + b->written, sectors, ptr_written ?: btree_sectors(c))) 1161 + i->u64s = 0; 1146 1162 if (good_csum_type) { 1147 1163 struct bch_csum csum = 
csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); 1148 1164 bool csum_bad = bch2_crc_cmp(b->data->csum, csum); ··· 1178 1178 c, NULL, b, NULL, NULL, 1179 1179 btree_node_unsupported_version, 1180 1180 "btree node does not have NEW_EXTENT_OVERWRITE set"); 1181 - 1182 - sectors = vstruct_sectors(b->data, c->block_bits); 1183 1181 } else { 1182 + sectors = vstruct_sectors(bne, c->block_bits); 1183 + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), 1184 + -BCH_ERR_btree_node_read_err_fixable, 1185 + c, ca, b, i, NULL, 1186 + bset_past_end_of_btree_node, 1187 + "bset past end of btree node (offset %u len %u but written %zu)", 1188 + b->written, sectors, ptr_written ?: btree_sectors(c))) 1189 + i->u64s = 0; 1184 1190 if (good_csum_type) { 1185 1191 struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); 1186 1192 bool csum_bad = bch2_crc_cmp(bne->csum, csum); ··· 1207 1201 "decrypting btree node: %s", bch2_err_str(ret))) 1208 1202 goto fsck_err; 1209 1203 } 1210 - 1211 - sectors = vstruct_sectors(bne, c->block_bits); 1212 1204 } 1213 1205 1214 1206 b->version_ondisk = min(b->version_ondisk, 1215 1207 le16_to_cpu(i->version)); 1216 1208 1217 - ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); 1209 + ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); 1218 1210 if (ret) 1219 1211 goto fsck_err; 1220 1212 ··· 1986 1982 prt_newline(&err); 1987 1983 1988 1984 if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { 1989 - struct btree_trans *trans = bch2_trans_get(c); 1990 - 1991 - struct btree_iter iter; 1992 - bch2_trans_node_iter_init(trans, &iter, scrub->btree, 1993 - scrub->key.k->k.p, 0, scrub->level - 1, 0); 1994 - 1995 - struct btree *b; 1996 - int ret = lockrestart_do(trans, 1997 - PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); 1998 - if (ret) 1999 - goto err; 2000 - 2001 - if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { 2002 - bch_err(c, 
"error validating btree node during scrub on %s at btree %s", 2003 - scrub->ca->name, err.buf); 2004 - 2005 - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); 2006 - } 2007 - err: 2008 - bch2_trans_iter_exit(trans, &iter); 2009 - bch2_trans_begin(trans); 2010 - bch2_trans_put(trans); 1985 + int ret = bch2_trans_do(c, 1986 + bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1, 1987 + scrub->key.k, 0)); 1988 + if (!bch2_err_matches(ret, ENOENT) && 1989 + !bch2_err_matches(ret, EROFS)) 1990 + bch_err_fn_ratelimited(c, ret); 2011 1991 } 2012 1992 2013 1993 printbuf_exit(&err); ··· 2255 2267 } 2256 2268 2257 2269 static int validate_bset_for_write(struct bch_fs *c, struct btree *b, 2258 - struct bset *i, unsigned sectors) 2270 + struct bset *i) 2259 2271 { 2260 2272 int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), 2261 2273 (struct bkey_validate_context) { ··· 2270 2282 } 2271 2283 2272 2284 ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: 2273 - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); 2285 + validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); 2274 2286 if (ret) { 2275 2287 bch2_inconsistent_error(c); 2276 2288 dump_stack(); ··· 2463 2475 2464 2476 /* if we're going to be encrypting, check metadata validity first: */ 2465 2477 if (validate_before_checksum && 2466 - validate_bset_for_write(c, b, i, sectors_to_write)) 2478 + validate_bset_for_write(c, b, i)) 2467 2479 goto err; 2468 2480 2469 2481 ret = bset_encrypt(c, i, b->written << 9); ··· 2480 2492 2481 2493 /* if we're not encrypting, check metadata after checksumming: */ 2482 2494 if (!validate_before_checksum && 2483 - validate_bset_for_write(c, b, i, sectors_to_write)) 2495 + validate_bset_for_write(c, b, i)) 2484 2496 goto err; 2485 2497 2486 2498 /*
+120 -55
fs/bcachefs/btree_iter.c
··· 2076 2076 2077 2077 static noinline 2078 2078 void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, 2079 - struct bkey_s_c *k) 2079 + struct bpos search_key, struct bkey_s_c *k) 2080 2080 { 2081 2081 struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; 2082 2082 2083 2083 trans_for_each_update(trans, i) 2084 2084 if (!i->key_cache_already_flushed && 2085 2085 i->btree_id == iter->btree_id && 2086 - bpos_le(i->k->k.p, iter->pos) && 2086 + bpos_le(i->k->k.p, search_key) && 2087 2087 bpos_ge(i->k->k.p, k->k ? k->k->p : end)) { 2088 2088 iter->k = i->k->k; 2089 2089 *k = bkey_i_to_s_c(i->k); ··· 2092 2092 2093 2093 static noinline 2094 2094 void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, 2095 + struct bpos search_key, 2095 2096 struct bkey_s_c *k) 2096 2097 { 2097 2098 struct btree_path *path = btree_iter_path(trans, iter); ··· 2101 2100 trans_for_each_update(trans, i) 2102 2101 if (!i->key_cache_already_flushed && 2103 2102 i->btree_id == iter->btree_id && 2104 - bpos_ge(i->k->k.p, path->pos) && 2103 + bpos_ge(i->k->k.p, search_key) && 2105 2104 bpos_le(i->k->k.p, k->k ? 
k->k->p : end)) { 2106 2105 iter->k = i->k->k; 2107 2106 *k = bkey_i_to_s_c(i->k); ··· 2123 2122 2124 2123 static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, 2125 2124 struct btree_iter *iter, 2125 + struct bpos search_pos, 2126 2126 struct bpos end_pos) 2127 2127 { 2128 2128 struct btree_path *path = btree_iter_path(trans, iter); 2129 2129 2130 2130 return bch2_journal_keys_peek_max(trans->c, iter->btree_id, 2131 2131 path->level, 2132 - path->pos, 2132 + search_pos, 2133 2133 end_pos, 2134 2134 &iter->journal_idx); 2135 2135 } ··· 2140 2138 struct btree_iter *iter) 2141 2139 { 2142 2140 struct btree_path *path = btree_iter_path(trans, iter); 2143 - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); 2141 + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); 2144 2142 2145 2143 if (k) { 2146 2144 iter->k = k->k; ··· 2153 2151 static noinline 2154 2152 void btree_trans_peek_journal(struct btree_trans *trans, 2155 2153 struct btree_iter *iter, 2154 + struct bpos search_key, 2156 2155 struct bkey_s_c *k) 2157 2156 { 2158 2157 struct btree_path *path = btree_iter_path(trans, iter); 2159 2158 struct bkey_i *next_journal = 2160 - bch2_btree_journal_peek(trans, iter, 2159 + bch2_btree_journal_peek(trans, iter, search_key, 2161 2160 k->k ? 
k->k->p : path_l(path)->b->key.k.p); 2162 2161 if (next_journal) { 2163 2162 iter->k = next_journal->k; ··· 2168 2165 2169 2166 static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, 2170 2167 struct btree_iter *iter, 2168 + struct bpos search_key, 2171 2169 struct bpos end_pos) 2172 2170 { 2173 2171 struct btree_path *path = btree_iter_path(trans, iter); 2174 2172 2175 2173 return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, 2176 2174 path->level, 2177 - path->pos, 2175 + search_key, 2178 2176 end_pos, 2179 2177 &iter->journal_idx); 2180 2178 } ··· 2183 2179 static noinline 2184 2180 void btree_trans_peek_prev_journal(struct btree_trans *trans, 2185 2181 struct btree_iter *iter, 2182 + struct bpos search_key, 2186 2183 struct bkey_s_c *k) 2187 2184 { 2188 2185 struct btree_path *path = btree_iter_path(trans, iter); 2189 2186 struct bkey_i *next_journal = 2190 - bch2_btree_journal_peek_prev(trans, iter, 2187 + bch2_btree_journal_peek_prev(trans, iter, search_key, 2191 2188 k->k ? 
k->k->p : path_l(path)->b->key.k.p); 2192 2189 2193 2190 if (next_journal) { ··· 2297 2292 } 2298 2293 2299 2294 if (unlikely(iter->flags & BTREE_ITER_with_journal)) 2300 - btree_trans_peek_journal(trans, iter, &k); 2295 + btree_trans_peek_journal(trans, iter, search_key, &k); 2301 2296 2302 2297 if (unlikely((iter->flags & BTREE_ITER_with_updates) && 2303 2298 trans->nr_updates)) 2304 - bch2_btree_trans_peek_updates(trans, iter, &k); 2299 + bch2_btree_trans_peek_updates(trans, iter, search_key, &k); 2305 2300 2306 2301 if (k.k && bkey_deleted(k.k)) { 2307 2302 /* ··· 2331 2326 } 2332 2327 2333 2328 bch2_btree_iter_verify(trans, iter); 2329 + 2330 + if (trace___btree_iter_peek_enabled()) { 2331 + CLASS(printbuf, buf)(); 2332 + 2333 + int ret = bkey_err(k); 2334 + if (ret) 2335 + prt_str(&buf, bch2_err_str(ret)); 2336 + else if (k.k) 2337 + bch2_bkey_val_to_text(&buf, trans->c, k); 2338 + else 2339 + prt_str(&buf, "(null)"); 2340 + trace___btree_iter_peek(trans->c, buf.buf); 2341 + } 2342 + 2334 2343 return k; 2335 2344 } 2336 2345 ··· 2503 2484 2504 2485 bch2_btree_iter_verify_entry_exit(iter); 2505 2486 2487 + if (trace_btree_iter_peek_max_enabled()) { 2488 + CLASS(printbuf, buf)(); 2489 + 2490 + int ret = bkey_err(k); 2491 + if (ret) 2492 + prt_str(&buf, bch2_err_str(ret)); 2493 + else if (k.k) 2494 + bch2_bkey_val_to_text(&buf, trans->c, k); 2495 + else 2496 + prt_str(&buf, "(null)"); 2497 + trace_btree_iter_peek_max(trans->c, buf.buf); 2498 + } 2499 + 2506 2500 return k; 2507 2501 end: 2508 2502 bch2_btree_iter_set_pos(trans, iter, end); ··· 2589 2557 } 2590 2558 2591 2559 if (unlikely(iter->flags & BTREE_ITER_with_journal)) 2592 - btree_trans_peek_prev_journal(trans, iter, &k); 2560 + btree_trans_peek_prev_journal(trans, iter, search_key, &k); 2593 2561 2594 2562 if (unlikely((iter->flags & BTREE_ITER_with_updates) && 2595 2563 trans->nr_updates)) 2596 - bch2_btree_trans_peek_prev_updates(trans, iter, &k); 2564 + bch2_btree_trans_peek_prev_updates(trans, iter, 
search_key, &k); 2597 2565 2598 2566 if (likely(k.k && !bkey_deleted(k.k))) { 2599 2567 break; ··· 2756 2724 2757 2725 bch2_btree_iter_verify_entry_exit(iter); 2758 2726 bch2_btree_iter_verify(trans, iter); 2727 + 2728 + if (trace_btree_iter_peek_prev_min_enabled()) { 2729 + CLASS(printbuf, buf)(); 2730 + 2731 + int ret = bkey_err(k); 2732 + if (ret) 2733 + prt_str(&buf, bch2_err_str(ret)); 2734 + else if (k.k) 2735 + bch2_bkey_val_to_text(&buf, trans->c, k); 2736 + else 2737 + prt_str(&buf, "(null)"); 2738 + trace_btree_iter_peek_prev_min(trans->c, buf.buf); 2739 + } 2759 2740 return k; 2760 2741 end: 2761 2742 bch2_btree_iter_set_pos(trans, iter, end); ··· 2812 2767 /* extents can't span inode numbers: */ 2813 2768 if ((iter->flags & BTREE_ITER_is_extents) && 2814 2769 unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { 2815 - if (iter->pos.inode == KEY_INODE_MAX) 2816 - return bkey_s_c_null; 2770 + if (iter->pos.inode == KEY_INODE_MAX) { 2771 + k = bkey_s_c_null; 2772 + goto out2; 2773 + } 2817 2774 2818 2775 bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); 2819 2776 } ··· 2832 2785 } 2833 2786 2834 2787 struct btree_path *path = btree_iter_path(trans, iter); 2835 - if (unlikely(!btree_path_node(path, path->level))) 2836 - return bkey_s_c_null; 2788 + if (unlikely(!btree_path_node(path, path->level))) { 2789 + k = bkey_s_c_null; 2790 + goto out2; 2791 + } 2837 2792 2838 2793 btree_path_set_should_be_locked(trans, path); 2839 2794 ··· 2928 2879 bch2_btree_iter_verify(trans, iter); 2929 2880 ret = bch2_btree_iter_verify_ret(trans, iter, k); 2930 2881 if (unlikely(ret)) 2931 - return bkey_s_c_err(ret); 2882 + k = bkey_s_c_err(ret); 2883 + out2: 2884 + if (trace_btree_iter_peek_slot_enabled()) { 2885 + CLASS(printbuf, buf)(); 2886 + 2887 + int ret = bkey_err(k); 2888 + if (ret) 2889 + prt_str(&buf, bch2_err_str(ret)); 2890 + else if (k.k) 2891 + bch2_bkey_val_to_text(&buf, trans->c, k); 2892 + else 2893 + prt_str(&buf, "(null)"); 2894 + 
trace_btree_iter_peek_slot(trans->c, buf.buf); 2895 + } 2932 2896 2933 2897 return k; 2934 2898 } ··· 3194 3132 if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { 3195 3133 #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE 3196 3134 struct printbuf buf = PRINTBUF; 3135 + bch2_log_msg_start(c, &buf); 3136 + prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", 3137 + BTREE_TRANS_MEM_MAX); 3138 + 3197 3139 bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); 3198 3140 bch2_print_str(c, KERN_ERR, buf.buf); 3199 3141 printbuf_exit(&buf); ··· 3225 3159 mutex_unlock(&s->lock); 3226 3160 } 3227 3161 3228 - if (trans->used_mempool) { 3229 - if (trans->mem_bytes >= new_bytes) 3230 - goto out_change_top; 3231 - 3232 - /* No more space from mempool item, need malloc new one */ 3233 - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); 3234 - if (unlikely(!new_mem)) { 3235 - bch2_trans_unlock(trans); 3236 - 3237 - new_mem = kmalloc(new_bytes, GFP_KERNEL); 3238 - if (!new_mem) 3239 - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); 3240 - 3241 - ret = bch2_trans_relock(trans); 3242 - if (ret) { 3243 - kfree(new_mem); 3244 - return ERR_PTR(ret); 3245 - } 3246 - } 3247 - memcpy(new_mem, trans->mem, trans->mem_top); 3248 - trans->used_mempool = false; 3249 - mempool_free(trans->mem, &c->btree_trans_mem_pool); 3250 - goto out_new_mem; 3162 + if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { 3163 + EBUG_ON(trans->mem_bytes >= new_bytes); 3164 + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); 3251 3165 } 3252 3166 3253 - new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); 3167 + if (old_bytes) { 3168 + trans->realloc_bytes_required = new_bytes; 3169 + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); 3170 + return ERR_PTR(btree_trans_restart_ip(trans, 3171 + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); 3172 + } 3173 + 3174 + EBUG_ON(trans->mem); 3175 + 3176 + new_mem = kmalloc(new_bytes, 
GFP_NOWAIT|__GFP_NOWARN); 3254 3177 if (unlikely(!new_mem)) { 3255 3178 bch2_trans_unlock(trans); 3256 3179 3257 - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); 3180 + new_mem = kmalloc(new_bytes, GFP_KERNEL); 3258 3181 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { 3259 3182 new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); 3260 3183 new_bytes = BTREE_TRANS_MEM_MAX; 3261 - memcpy(new_mem, trans->mem, trans->mem_top); 3262 3184 trans->used_mempool = true; 3263 - kfree(trans->mem); 3264 3185 } 3265 3186 3266 - if (!new_mem) 3267 - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); 3187 + EBUG_ON(!new_mem); 3268 3188 3269 3189 trans->mem = new_mem; 3270 3190 trans->mem_bytes = new_bytes; ··· 3259 3207 if (ret) 3260 3208 return ERR_PTR(ret); 3261 3209 } 3262 - out_new_mem: 3210 + 3263 3211 trans->mem = new_mem; 3264 3212 trans->mem_bytes = new_bytes; 3265 - 3266 - if (old_bytes) { 3267 - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); 3268 - return ERR_PTR(btree_trans_restart_ip(trans, 3269 - BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); 3270 - } 3271 - out_change_top: 3272 - bch2_trans_kmalloc_trace(trans, size, ip); 3273 3213 3274 3214 p = trans->mem + trans->mem_top; 3275 3215 trans->mem_top += size; ··· 3322 3278 3323 3279 trans->restart_count++; 3324 3280 trans->mem_top = 0; 3281 + 3282 + if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) { 3283 + EBUG_ON(!trans->mem || !trans->mem_bytes); 3284 + unsigned new_bytes = trans->realloc_bytes_required; 3285 + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); 3286 + if (unlikely(!new_mem)) { 3287 + bch2_trans_unlock(trans); 3288 + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); 3289 + 3290 + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); 3291 + 3292 + if (!new_mem) { 3293 + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); 3294 + new_bytes = BTREE_TRANS_MEM_MAX; 3295 + trans->used_mempool = true; 
3296 + kfree(trans->mem); 3297 + } 3298 + } 3299 + trans->mem = new_mem; 3300 + trans->mem_bytes = new_bytes; 3301 + } 3325 3302 3326 3303 trans_for_each_path(trans, path, i) { 3327 3304 path->should_be_locked = false;
+53 -25
fs/bcachefs/btree_journal_iter.c
··· 137 137 struct journal_key *k; 138 138 139 139 BUG_ON(*idx > keys->nr); 140 + 141 + if (!keys->nr) 142 + return NULL; 140 143 search: 141 144 if (!*idx) 142 145 *idx = __bch2_journal_key_search(keys, btree_id, level, pos); 143 146 144 - while (*idx && 145 - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { 147 + while (*idx < keys->nr && 148 + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { 146 149 (*idx)++; 147 150 iters++; 148 151 if (iters == 10) { ··· 154 151 } 155 152 } 156 153 154 + if (*idx == keys->nr) 155 + --(*idx); 156 + 157 157 struct bkey_i *ret = NULL; 158 158 rcu_read_lock(); /* for overwritten_ranges */ 159 159 160 - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { 160 + while (true) { 161 + k = idx_to_key(keys, *idx); 161 162 if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) 162 163 break; 163 164 164 165 if (k->overwritten) { 165 166 if (k->overwritten_range) 166 - *idx = rcu_dereference(k->overwritten_range)->start - 1; 167 - else 168 - *idx -= 1; 167 + *idx = rcu_dereference(k->overwritten_range)->start; 168 + if (!*idx) 169 + break; 170 + --(*idx); 169 171 continue; 170 172 } 171 173 ··· 179 171 break; 180 172 } 181 173 174 + if (!*idx) 175 + break; 182 176 --(*idx); 183 177 iters++; 184 178 if (iters == 10) { ··· 651 641 { 652 642 const struct journal_key *l = _l; 653 643 const struct journal_key *r = _r; 644 + int rewind = l->rewind && r->rewind ? 
-1 : 1; 654 645 655 646 return journal_key_cmp(l, r) ?: 656 - cmp_int(l->journal_seq, r->journal_seq) ?: 657 - cmp_int(l->journal_offset, r->journal_offset); 647 + ((cmp_int(l->journal_seq, r->journal_seq) ?: 648 + cmp_int(l->journal_offset, r->journal_offset)) * rewind); 658 649 } 659 650 660 651 void bch2_journal_keys_put(struct bch_fs *c) ··· 724 713 struct journal_keys *keys = &c->journal_keys; 725 714 size_t nr_read = 0; 726 715 716 + u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; 717 + 727 718 genradix_for_each(&c->journal_entries, iter, _i) { 728 719 i = *_i; 729 720 ··· 734 721 735 722 cond_resched(); 736 723 737 - for_each_jset_key(k, entry, &i->j) { 738 - struct journal_key n = (struct journal_key) { 739 - .btree_id = entry->btree_id, 740 - .level = entry->level, 741 - .k = k, 742 - .journal_seq = le64_to_cpu(i->j.seq), 743 - .journal_offset = k->_data - i->j._data, 744 - }; 724 + vstruct_for_each(&i->j, entry) { 725 + bool rewind = !entry->level && 726 + !btree_id_is_alloc(entry->btree_id) && 727 + le64_to_cpu(i->j.seq) >= rewind_seq; 745 728 746 - if (darray_push(keys, n)) { 747 - __journal_keys_sort(keys); 729 + if (entry->type != (rewind 730 + ? 
BCH_JSET_ENTRY_overwrite 731 + : BCH_JSET_ENTRY_btree_keys)) 732 + continue; 748 733 749 - if (keys->nr * 8 > keys->size * 7) { 750 - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", 751 - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); 752 - return bch_err_throw(c, ENOMEM_journal_keys_sort); 734 + if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) 735 + continue; 736 + 737 + jset_entry_for_each_key(entry, k) { 738 + struct journal_key n = (struct journal_key) { 739 + .btree_id = entry->btree_id, 740 + .level = entry->level, 741 + .rewind = rewind, 742 + .k = k, 743 + .journal_seq = le64_to_cpu(i->j.seq), 744 + .journal_offset = k->_data - i->j._data, 745 + }; 746 + 747 + if (darray_push(keys, n)) { 748 + __journal_keys_sort(keys); 749 + 750 + if (keys->nr * 8 > keys->size * 7) { 751 + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", 752 + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); 753 + return bch_err_throw(c, ENOMEM_journal_keys_sort); 754 + } 755 + 756 + BUG_ON(darray_push(keys, n)); 753 757 } 754 758 755 - BUG_ON(darray_push(keys, n)); 759 + nr_read++; 756 760 } 757 - 758 - nr_read++; 759 761 } 760 762 } 761 763
+3 -2
fs/bcachefs/btree_journal_iter_types.h
··· 11 11 u32 journal_offset; 12 12 enum btree_id btree_id:8; 13 13 unsigned level:8; 14 - bool allocated; 15 - bool overwritten; 14 + bool allocated:1; 15 + bool overwritten:1; 16 + bool rewind:1; 16 17 struct journal_key_range_overwritten __rcu * 17 18 overwritten_range; 18 19 struct bkey_i *k;
+6 -6
fs/bcachefs/btree_locking.c
··· 771 771 } 772 772 773 773 static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, 774 - struct get_locks_fail *f, bool trace) 774 + struct get_locks_fail *f, bool trace, ulong ip) 775 775 { 776 776 if (!trace) 777 777 goto out; ··· 796 796 prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); 797 797 } 798 798 799 - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); 799 + trace_trans_restart_relock(trans, ip, buf.buf); 800 800 printbuf_exit(&buf); 801 801 } 802 802 ··· 806 806 bch2_trans_verify_locks(trans); 807 807 } 808 808 809 - static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) 809 + static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) 810 810 { 811 811 bch2_trans_verify_locks(trans); 812 812 ··· 825 825 if (path->should_be_locked && 826 826 (ret = btree_path_get_locks(trans, path, false, &f, 827 827 BCH_ERR_transaction_restart_relock))) { 828 - bch2_trans_relock_fail(trans, path, &f, trace); 828 + bch2_trans_relock_fail(trans, path, &f, trace, ip); 829 829 return ret; 830 830 } 831 831 } ··· 838 838 839 839 int bch2_trans_relock(struct btree_trans *trans) 840 840 { 841 - return __bch2_trans_relock(trans, true); 841 + return __bch2_trans_relock(trans, true, _RET_IP_); 842 842 } 843 843 844 844 int bch2_trans_relock_notrace(struct btree_trans *trans) 845 845 { 846 - return __bch2_trans_relock(trans, false); 846 + return __bch2_trans_relock(trans, false, _RET_IP_); 847 847 } 848 848 849 849 void bch2_trans_unlock(struct btree_trans *trans)
+5 -1
fs/bcachefs/btree_node_scan.c
··· 521 521 return false; 522 522 } 523 523 524 - bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) 524 + int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) 525 525 { 526 + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); 527 + if (ret) 528 + return ret; 529 + 526 530 struct found_btree_node search = { 527 531 .btree_id = btree, 528 532 .level = 0,
+1 -1
fs/bcachefs/btree_node_scan.h
··· 4 4 5 5 int bch2_scan_for_btree_nodes(struct bch_fs *); 6 6 bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); 7 - bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); 7 + int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); 8 8 int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); 9 9 void bch2_find_btree_nodes_exit(struct find_btree_nodes *); 10 10
+12 -6
fs/bcachefs/btree_trans_commit.c
··· 595 595 int ret = 0; 596 596 597 597 bch2_trans_verify_not_unlocked_or_in_restart(trans); 598 - 598 + #if 0 599 + /* todo: bring back dynamic fault injection */ 599 600 if (race_fault()) { 600 601 trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); 601 602 return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); 602 603 } 603 - 604 + #endif 604 605 /* 605 606 * Check if the insert will fit in the leaf node with the write lock 606 607 * held, otherwise another thread could write the node changing the ··· 757 756 memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), 758 757 btree_trans_journal_entries_start(trans), 759 758 trans->journal_entries.u64s); 759 + 760 + EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); 760 761 761 762 trans->journal_res.offset += trans->journal_entries.u64s; 762 763 trans->journal_res.u64s -= trans->journal_entries.u64s; ··· 1006 1003 { 1007 1004 struct btree_insert_entry *errored_at = NULL; 1008 1005 struct bch_fs *c = trans->c; 1006 + unsigned journal_u64s = 0; 1009 1007 int ret = 0; 1010 1008 1011 1009 bch2_trans_verify_not_unlocked_or_in_restart(trans); ··· 1035 1031 1036 1032 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); 1037 1033 1038 - trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); 1034 + journal_u64s = jset_u64s(trans->accounting.u64s); 1039 1035 trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); 1040 1036 if (trans->journal_transaction_names) 1041 - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); 1037 + journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); 1042 1038 1043 1039 trans_for_each_update(trans, i) { 1044 1040 struct btree_path *path = trans->paths + i->path; ··· 1058 1054 continue; 1059 1055 1060 1056 /* we're going to journal the key being updated: */ 1061 - trans->journal_u64s += jset_u64s(i->k->k.u64s); 1057 + journal_u64s += jset_u64s(i->k->k.u64s); 1062 1058 1063 1059 /* 
and we're also going to log the overwrite: */ 1064 1060 if (trans->journal_transaction_names) 1065 - trans->journal_u64s += jset_u64s(i->old_k.u64s); 1061 + journal_u64s += jset_u64s(i->old_k.u64s); 1066 1062 } 1067 1063 1068 1064 if (trans->extra_disk_res) { ··· 1079 1075 if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) 1080 1076 memset(&trans->journal_res, 0, sizeof(trans->journal_res)); 1081 1077 memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); 1078 + 1079 + trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; 1082 1080 1083 1081 ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); 1084 1082
+1
fs/bcachefs/btree_types.h
··· 497 497 void *mem; 498 498 unsigned mem_top; 499 499 unsigned mem_bytes; 500 + unsigned realloc_bytes_required; 500 501 #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE 501 502 darray_trans_kmalloc_trace trans_kmalloc_trace; 502 503 #endif
+11 -5
fs/bcachefs/btree_update.c
··· 549 549 unsigned u64s) 550 550 { 551 551 unsigned new_top = buf->u64s + u64s; 552 - unsigned old_size = buf->size; 552 + unsigned new_size = buf->size; 553 553 554 - if (new_top > buf->size) 555 - buf->size = roundup_pow_of_two(new_top); 554 + BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); 556 555 557 - void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); 556 + if (new_top > new_size) 557 + new_size = roundup_pow_of_two(new_top); 558 + 559 + void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64)); 558 560 if (IS_ERR(n)) 559 561 return n; 562 + 563 + unsigned offset = (u64 *) n - (u64 *) trans->mem; 564 + BUG_ON(offset > U16_MAX); 560 565 561 566 if (buf->u64s) 562 567 memcpy(n, 563 568 btree_trans_subbuf_base(trans, buf), 564 - old_size * sizeof(u64)); 569 + buf->size * sizeof(u64)); 565 570 buf->base = (u64 *) n - (u64 *) trans->mem; 571 + buf->size = new_size; 566 572 567 573 void *p = btree_trans_subbuf_top(trans, buf); 568 574 buf->u64s = new_top;
+2 -3
fs/bcachefs/btree_update.h
··· 170 170 171 171 int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); 172 172 173 - int bch2_btree_write_buffer_insert_err(struct btree_trans *, 174 - enum btree_id, struct bkey_i *); 173 + int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); 175 174 176 175 static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, 177 176 enum btree_id btree, ··· 181 182 EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); 182 183 183 184 if (unlikely(!btree_type_uses_write_buffer(btree))) { 184 - int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); 185 + int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); 185 186 dump_stack(); 186 187 return ret; 187 188 }
+8 -8
fs/bcachefs/btree_update_interior.c
··· 1287 1287 1288 1288 do { 1289 1289 ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); 1290 - 1290 + if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) 1291 + break; 1291 1292 bch2_trans_unlock(trans); 1292 1293 bch2_wait_on_allocator(c, &cl); 1293 - } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); 1294 + } while (1); 1294 1295 } 1295 1296 1296 1297 if (ret) { ··· 2294 2293 goto out; 2295 2294 } 2296 2295 2297 - static int bch2_btree_node_rewrite_key(struct btree_trans *trans, 2298 - enum btree_id btree, unsigned level, 2299 - struct bkey_i *k, unsigned flags) 2296 + int bch2_btree_node_rewrite_key(struct btree_trans *trans, 2297 + enum btree_id btree, unsigned level, 2298 + struct bkey_i *k, unsigned flags) 2300 2299 { 2301 2300 struct btree_iter iter; 2302 2301 bch2_trans_node_iter_init(trans, &iter, ··· 2368 2367 2369 2368 int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, 2370 2369 a->btree_id, a->level, a->key.k, 0)); 2371 - if (ret != -ENOENT && 2372 - !bch2_err_matches(ret, EROFS) && 2373 - ret != -BCH_ERR_journal_shutdown) 2370 + if (!bch2_err_matches(ret, ENOENT) && 2371 + !bch2_err_matches(ret, EROFS)) 2374 2372 bch_err_fn_ratelimited(c, ret); 2375 2373 2376 2374 spin_lock(&c->btree_node_rewrites_lock);
+3
fs/bcachefs/btree_update_interior.h
··· 176 176 177 177 int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, 178 178 struct btree *, unsigned, unsigned); 179 + int bch2_btree_node_rewrite_key(struct btree_trans *, 180 + enum btree_id, unsigned, 181 + struct bkey_i *, unsigned); 179 182 int bch2_btree_node_rewrite_pos(struct btree_trans *, 180 183 enum btree_id, unsigned, 181 184 struct bpos, unsigned, unsigned);
+5 -3
fs/bcachefs/btree_write_buffer.c
··· 267 267 BUG_ON(wb->sorted.size < wb->flushing.keys.nr); 268 268 } 269 269 270 - int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, 270 + int bch2_btree_write_buffer_insert_err(struct bch_fs *c, 271 271 enum btree_id btree, struct bkey_i *k) 272 272 { 273 - struct bch_fs *c = trans->c; 274 273 struct printbuf buf = PRINTBUF; 275 274 276 275 prt_printf(&buf, "attempting to do write buffer update on non wb btree="); ··· 331 332 struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; 332 333 333 334 if (unlikely(!btree_type_uses_write_buffer(k->btree))) { 334 - ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); 335 + ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); 335 336 goto err; 336 337 } 337 338 ··· 675 676 goto err; 676 677 677 678 bch2_bkey_buf_copy(last_flushed, c, tmp.k); 679 + 680 + /* can we avoid the unconditional restart? */ 681 + trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); 678 682 ret = bch_err_throw(c, transaction_restart_write_buffer_flush); 679 683 } 680 684 err:
+6
fs/bcachefs/btree_write_buffer.h
··· 89 89 struct journal_keys_to_wb *dst, 90 90 enum btree_id btree, struct bkey_i *k) 91 91 { 92 + if (unlikely(!btree_type_uses_write_buffer(btree))) { 93 + int ret = bch2_btree_write_buffer_insert_err(c, btree, k); 94 + dump_stack(); 95 + return ret; 96 + } 97 + 92 98 EBUG_ON(!dst->seq); 93 99 94 100 return k->k.type == KEY_TYPE_accounting
+22 -7
fs/bcachefs/chardev.c
··· 319 319 ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; 320 320 ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; 321 321 } 322 + enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); 322 323 return 0; 323 324 } 324 325 ··· 379 378 struct bch_data_ctx *ctx; 380 379 int ret; 381 380 382 - if (!capable(CAP_SYS_ADMIN)) 383 - return -EPERM; 381 + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) 382 + return -EROFS; 384 383 385 - if (arg.op >= BCH_DATA_OP_NR || arg.flags) 386 - return -EINVAL; 384 + if (!capable(CAP_SYS_ADMIN)) { 385 + ret = -EPERM; 386 + goto put_ref; 387 + } 388 + 389 + if (arg.op >= BCH_DATA_OP_NR || arg.flags) { 390 + ret = -EINVAL; 391 + goto put_ref; 392 + } 387 393 388 394 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 389 - if (!ctx) 390 - return -ENOMEM; 395 + if (!ctx) { 396 + ret = -ENOMEM; 397 + goto put_ref; 398 + } 391 399 392 400 ctx->c = c; 393 401 ctx->arg = arg; ··· 405 395 &bcachefs_data_ops, 406 396 bch2_data_thread); 407 397 if (ret < 0) 408 - kfree(ctx); 398 + goto cleanup; 399 + return ret; 400 + cleanup: 401 + kfree(ctx); 402 + put_ref: 403 + enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); 409 404 return ret; 410 405 } 411 406
+1
fs/bcachefs/data_update.c
··· 249 249 bch2_bkey_val_to_text(&buf, c, k); 250 250 prt_str(&buf, "\nnew: "); 251 251 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); 252 + prt_newline(&buf); 252 253 253 254 bch2_fs_emergency_read_only2(c, &buf); 254 255
-5
fs/bcachefs/errcode.h
··· 137 137 x(BCH_ERR_transaction_restart, transaction_restart_relock) \ 138 138 x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ 139 139 x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ 140 - x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ 141 140 x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ 142 141 x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ 143 142 x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ ··· 147 148 x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ 148 149 x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ 149 150 x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ 150 - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ 151 151 x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ 152 152 x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ 153 - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ 154 - x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ 155 153 x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ 156 154 x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ 157 155 x(BCH_ERR_transaction_restart, transaction_restart_nested) \ ··· 237 241 x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ 238 242 x(BCH_ERR_journal_res_blocked, journal_stuck) \ 239 243 x(BCH_ERR_journal_res_blocked, journal_retry_open) \ 240 - x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ 241 244 x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ 242 245 x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ 243 246 x(BCH_ERR_invalid, invalid_sb) \
+3 -1
fs/bcachefs/error.c
··· 621 621 if (s) 622 622 s->ret = ret; 623 623 624 - if (trans) 624 + if (trans && 625 + !(flags & FSCK_ERR_NO_LOG) && 626 + ret == -BCH_ERR_fsck_fix) 625 627 ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; 626 628 err_unlock: 627 629 mutex_unlock(&c->fsck_error_msgs_lock);
+12 -1
fs/bcachefs/extent_update.c
··· 139 139 if (ret) 140 140 return ret; 141 141 142 - bch2_cut_back(end, k); 142 + /* tracepoint */ 143 + 144 + if (bpos_lt(end, k->k.p)) { 145 + if (trace_extent_trim_atomic_enabled()) { 146 + CLASS(printbuf, buf)(); 147 + bch2_bpos_to_text(&buf, end); 148 + prt_newline(&buf); 149 + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); 150 + trace_extent_trim_atomic(trans->c, buf.buf); 151 + } 152 + bch2_cut_back(end, k); 153 + } 143 154 return 0; 144 155 }
+2 -1
fs/bcachefs/fs.c
··· 1732 1732 bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 1733 1733 ATTR_CTIME); 1734 1734 mutex_unlock(&inode->ei_update_lock); 1735 - return ret; 1735 + 1736 + return bch2_err_class(ret); 1736 1737 } 1737 1738 1738 1739 static const struct file_operations bch_file_operations = {
+218 -99
fs/bcachefs/fsck.c
··· 327 327 (inode->bi_flags & BCH_INODE_has_child_snapshot)) 328 328 return false; 329 329 330 - return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); 330 + return !bch2_inode_has_backpointer(inode) && 331 + !(inode->bi_flags & BCH_INODE_unlinked); 331 332 } 332 333 333 334 static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) ··· 373 372 if (inode->bi_subvol) { 374 373 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; 375 374 375 + struct btree_iter subvol_iter; 376 + struct bkey_i_subvolume *subvol = 377 + bch2_bkey_get_mut_typed(trans, &subvol_iter, 378 + BTREE_ID_subvolumes, POS(0, inode->bi_subvol), 379 + 0, subvolume); 380 + ret = PTR_ERR_OR_ZERO(subvol); 381 + if (ret) 382 + return ret; 383 + 384 + subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; 385 + bch2_trans_iter_exit(trans, &subvol_iter); 386 + 376 387 u64 root_inum; 377 388 ret = subvol_lookup(trans, inode->bi_parent_subvol, 378 389 &dirent_snapshot, &root_inum); ··· 399 386 ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); 400 387 if (ret) 401 388 return ret; 389 + 390 + bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); 402 391 403 392 lostfound.bi_nlink += S_ISDIR(inode->bi_mode); 404 393 ··· 437 422 ret = __bch2_fsck_write_inode(trans, inode); 438 423 if (ret) 439 424 return ret; 425 + 426 + { 427 + CLASS(printbuf, buf)(); 428 + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, 429 + inode->bi_snapshot, NULL, &buf); 430 + if (ret) 431 + return ret; 432 + 433 + bch_info(c, "reattached at %s", buf.buf); 434 + } 440 435 441 436 /* 442 437 * Fix up inodes in child snapshots: if they should also be reattached ··· 515 490 static int remove_backpointer(struct btree_trans *trans, 516 491 struct bch_inode_unpacked *inode) 517 492 { 518 - if (!inode->bi_dir) 493 + if (!bch2_inode_has_backpointer(inode)) 519 494 return 0; 495 + 496 + u32 snapshot = inode->bi_snapshot; 497 + 498 + if (inode->bi_parent_subvol) { 499 + 
int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); 500 + if (ret) 501 + return ret; 502 + } 520 503 521 504 struct bch_fs *c = trans->c; 522 505 struct btree_iter iter; 523 506 struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, 524 - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); 507 + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); 525 508 int ret = bkey_err(d) ?: 526 509 dirent_points_to_inode(c, d, inode) ?: 527 510 bch2_fsck_remove_dirent(trans, d.k->p); ··· 728 695 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, 729 696 u32 id, u32 ancestor) 730 697 { 731 - ssize_t i; 732 - 733 698 EBUG_ON(id > ancestor); 734 - 735 - /* @ancestor should be the snapshot most recently added to @seen */ 736 - EBUG_ON(ancestor != seen->pos.snapshot); 737 - EBUG_ON(ancestor != darray_last(seen->ids)); 738 699 739 700 if (id == ancestor) 740 701 return true; ··· 745 718 * numerically, since snapshot ID lists are kept sorted, so if we find 746 719 * an id that's an ancestor of @id we're done: 747 720 */ 748 - 749 - for (i = seen->ids.nr - 2; 750 - i >= 0 && seen->ids.data[i] >= id; 751 - --i) 752 - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) 721 + darray_for_each_reverse(seen->ids, i) 722 + if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) 753 723 return false; 754 724 755 725 return true; ··· 830 806 if (!n->whiteout) { 831 807 return bch2_inode_unpack(inode, &n->inode); 832 808 } else { 833 - n->inode.bi_inum = inode.k->p.inode; 809 + n->inode.bi_inum = inode.k->p.offset; 834 810 n->inode.bi_snapshot = inode.k->p.snapshot; 835 811 return 0; 836 812 } ··· 927 903 w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, 928 904 (bch2_bkey_val_to_text(&buf, c, k), 929 905 buf.buf))) { 930 - struct bch_inode_unpacked new = i->inode; 931 - struct bkey_i whiteout; 932 - 933 - new.bi_snapshot = k.k->p.snapshot; 934 - 935 906 if (!i->whiteout) { 907 + struct 
bch_inode_unpacked new = i->inode; 908 + new.bi_snapshot = k.k->p.snapshot; 936 909 ret = __bch2_fsck_write_inode(trans, &new); 937 910 } else { 911 + struct bkey_i whiteout; 938 912 bkey_init(&whiteout.k); 939 913 whiteout.k.type = KEY_TYPE_whiteout; 940 - whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); 914 + whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); 941 915 ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, 942 916 &whiteout, 943 917 BTREE_UPDATE_internal_snapshot_node); ··· 1157 1135 if (ret) 1158 1136 goto err; 1159 1137 1160 - if (u.bi_dir || u.bi_dir_offset) { 1138 + if (bch2_inode_has_backpointer(&u)) { 1161 1139 ret = check_inode_dirent_inode(trans, &u, &do_update); 1162 1140 if (ret) 1163 1141 goto err; 1164 1142 } 1165 1143 1166 - if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), 1144 + if (fsck_err_on(bch2_inode_has_backpointer(&u) && 1145 + (u.bi_flags & BCH_INODE_unlinked), 1167 1146 trans, inode_unlinked_but_has_dirent, 1168 1147 "inode unlinked but has dirent\n%s", 1169 1148 (printbuf_reset(&buf), ··· 1461 1438 { 1462 1439 struct bch_fs *c = trans->c; 1463 1440 struct printbuf buf = PRINTBUF; 1441 + struct btree_iter iter2 = {}; 1464 1442 int ret = PTR_ERR_OR_ZERO(i); 1465 1443 if (ret) 1466 1444 return ret; ··· 1471 1447 1472 1448 bool have_inode = i && !i->whiteout; 1473 1449 1474 - if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { 1475 - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1476 - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1477 - if (ret) 1478 - goto err; 1450 + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) 1451 + goto reconstruct; 1479 1452 1480 - inode->last_pos.inode--; 1481 - ret = bch_err_throw(c, transaction_restart_nested); 1482 - goto err; 1453 + if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) 1454 + goto out; 1455 + 1456 + prt_printf(&buf, ", "); 
1457 + 1458 + bool have_old_inode = false; 1459 + darray_for_each(inode->inodes, i2) 1460 + if (!i2->whiteout && 1461 + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && 1462 + btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { 1463 + prt_printf(&buf, "but found good inode in older snapshot\n"); 1464 + bch2_inode_unpacked_to_text(&buf, &i2->inode); 1465 + prt_newline(&buf); 1466 + have_old_inode = true; 1467 + break; 1468 + } 1469 + 1470 + struct bkey_s_c k2; 1471 + unsigned nr_keys = 0; 1472 + 1473 + prt_printf(&buf, "found keys:\n"); 1474 + 1475 + for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, 1476 + SPOS(k.k->p.inode, 0, k.k->p.snapshot), 1477 + POS(k.k->p.inode, U64_MAX), 1478 + 0, k2, ret) { 1479 + nr_keys++; 1480 + if (nr_keys <= 10) { 1481 + bch2_bkey_val_to_text(&buf, c, k2); 1482 + prt_newline(&buf); 1483 + } 1484 + if (nr_keys >= 100) 1485 + break; 1483 1486 } 1484 1487 1485 - if (fsck_err_on(!have_inode, 1486 - trans, key_in_missing_inode, 1487 - "key in missing inode:\n%s", 1488 - (printbuf_reset(&buf), 1489 - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1490 - goto delete; 1488 + if (ret) 1489 + goto err; 1491 1490 1492 - if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1493 - trans, key_in_wrong_inode_type, 1494 - "key for wrong inode mode %o:\n%s", 1495 - i->inode.bi_mode, 1496 - (printbuf_reset(&buf), 1497 - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1498 - goto delete; 1491 + if (nr_keys > 100) 1492 + prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); 1493 + else if (nr_keys > 10) 1494 + prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); 1495 + 1496 + if (!have_inode) { 1497 + if (fsck_err_on(!have_inode, 1498 + trans, key_in_missing_inode, 1499 + "key in missing inode%s", buf.buf)) { 1500 + /* 1501 + * Maybe a deletion that raced with data move, or something 1502 + * weird like that? 
But if we know the inode was deleted, or 1503 + * it's just a few keys, we can safely delete them. 1504 + * 1505 + * If it's many keys, we should probably recreate the inode 1506 + */ 1507 + if (have_old_inode || nr_keys <= 2) 1508 + goto delete; 1509 + else 1510 + goto reconstruct; 1511 + } 1512 + } else { 1513 + /* 1514 + * not autofix, this one would be a giant wtf - bit error in the 1515 + * inode corrupting i_mode? 1516 + * 1517 + * may want to try repairing inode instead of deleting 1518 + */ 1519 + if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), 1520 + trans, key_in_wrong_inode_type, 1521 + "key for wrong inode mode %o%s", 1522 + i->inode.bi_mode, buf.buf)) 1523 + goto delete; 1524 + } 1499 1525 out: 1500 1526 err: 1501 1527 fsck_err: 1528 + bch2_trans_iter_exit(trans, &iter2); 1502 1529 printbuf_exit(&buf); 1503 1530 bch_err_fn(c, ret); 1504 1531 return ret; 1505 1532 delete: 1533 + /* 1534 + * XXX: print out more info 1535 + * count up extents for this inode, check if we have different inode in 1536 + * an older snapshot version, perhaps decide if we want to reconstitute 1537 + */ 1506 1538 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); 1539 + goto out; 1540 + reconstruct: 1541 + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: 1542 + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 1543 + if (ret) 1544 + goto err; 1545 + 1546 + inode->last_pos.inode--; 1547 + ret = bch_err_throw(c, transaction_restart_nested); 1507 1548 goto out; 1508 1549 } 1509 1550 ··· 1911 1822 !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) 1912 1823 continue; 1913 1824 1914 - if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && 1825 + u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; 1826 + 1827 + if (fsck_err_on(k.k->p.offset > last_block && 1915 1828 !bkey_extent_is_reservation(k), 1916 1829 trans, 
extent_past_end_of_inode, 1917 1830 "extent type past end of inode %llu:%u, i_size %llu\n%s", 1918 1831 i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, 1919 1832 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1920 - struct btree_iter iter2; 1833 + struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout)); 1834 + ret = PTR_ERR_OR_ZERO(whiteout); 1835 + if (ret) 1836 + goto err; 1921 1837 1922 - bch2_trans_copy_iter(trans, &iter2, iter); 1923 - bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); 1838 + bkey_init(&whiteout->k); 1839 + whiteout->k.p = SPOS(k.k->p.inode, 1840 + last_block, 1841 + i->inode.bi_snapshot); 1842 + bch2_key_resize(&whiteout->k, 1843 + min(KEY_SIZE_MAX & (~0 << c->block_bits), 1844 + U64_MAX - whiteout->k.p.offset)); 1845 + 1846 + 1847 + /* 1848 + * Need a normal (not BTREE_ITER_all_snapshots) 1849 + * iterator, if we're deleting in a different 1850 + * snapshot and need to emit a whiteout 1851 + */ 1852 + struct btree_iter iter2; 1853 + bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents, 1854 + bkey_start_pos(&whiteout->k), 1855 + BTREE_ITER_intent); 1924 1856 ret = bch2_btree_iter_traverse(trans, &iter2) ?: 1925 - bch2_btree_delete_at(trans, &iter2, 1857 + bch2_trans_update(trans, &iter2, whiteout, 1926 1858 BTREE_UPDATE_internal_snapshot_node); 1927 1859 bch2_trans_iter_exit(trans, &iter2); 1928 1860 if (ret) ··· 2059 1949 continue; 2060 1950 } 2061 1951 2062 - if (fsck_err_on(i->inode.bi_nlink != i->count, 2063 - trans, inode_dir_wrong_nlink, 2064 - "directory %llu:%u with wrong i_nlink: got %u, should be %llu", 2065 - w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { 2066 - i->inode.bi_nlink = i->count; 2067 - ret = bch2_fsck_write_inode(trans, &i->inode); 2068 - if (ret) 2069 - break; 1952 + if (i->inode.bi_nlink != i->count) { 1953 + CLASS(printbuf, buf)(); 1954 + 1955 + lockrestart_do(trans, 1956 + bch2_inum_snapshot_to_path(trans, w->last_pos.inode, 1957 + 
i->inode.bi_snapshot, NULL, &buf)); 1958 + 1959 + if (fsck_err_on(i->inode.bi_nlink != i->count, 1960 + trans, inode_dir_wrong_nlink, 1961 + "directory with wrong i_nlink: got %u, should be %llu\n%s", 1962 + i->inode.bi_nlink, i->count, buf.buf)) { 1963 + i->inode.bi_nlink = i->count; 1964 + ret = bch2_fsck_write_inode(trans, &i->inode); 1965 + if (ret) 1966 + break; 1967 + } 2070 1968 } 2071 1969 } 2072 1970 fsck_err: ··· 2611 2493 if (k.k->type != KEY_TYPE_subvolume) 2612 2494 return 0; 2613 2495 2496 + subvol_inum start = { 2497 + .subvol = k.k->p.offset, 2498 + .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), 2499 + }; 2500 + 2614 2501 while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { 2615 2502 ret = darray_push(&subvol_path, k.k->p.offset); 2616 2503 if (ret) ··· 2634 2511 2635 2512 if (darray_u32_has(&subvol_path, parent)) { 2636 2513 printbuf_reset(&buf); 2637 - prt_printf(&buf, "subvolume loop:\n"); 2514 + prt_printf(&buf, "subvolume loop: "); 2638 2515 2639 - darray_for_each_reverse(subvol_path, i) 2640 - prt_printf(&buf, "%u ", *i); 2641 - prt_printf(&buf, "%u", parent); 2516 + ret = bch2_inum_to_path(trans, start, &buf); 2517 + if (ret) 2518 + goto err; 2642 2519 2643 2520 if (fsck_err(trans, subvol_loop, "%s", buf.buf)) 2644 2521 ret = reattach_subvol(trans, s); ··· 2682 2559 return ret; 2683 2560 } 2684 2561 2685 - struct pathbuf_entry { 2686 - u64 inum; 2687 - u32 snapshot; 2688 - }; 2689 - 2690 - typedef DARRAY(struct pathbuf_entry) pathbuf; 2691 - 2692 - static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, 2562 + static int bch2_bi_depth_renumber_one(struct btree_trans *trans, 2563 + u64 inum, u32 snapshot, 2693 2564 u32 new_depth) 2694 2565 { 2695 2566 struct btree_iter iter; 2696 2567 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 2697 - SPOS(0, p->inum, p->snapshot), 0); 2568 + SPOS(0, inum, snapshot), 0); 2698 2569 2699 2570 struct bch_inode_unpacked inode; 2700 2571 int ret = 
bkey_err(k) ?: ··· 2707 2590 return ret; 2708 2591 } 2709 2592 2710 - static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) 2593 + static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, 2594 + u32 snapshot, u32 new_bi_depth) 2711 2595 { 2712 2596 u32 restart_count = trans->restart_count; 2713 2597 int ret = 0; 2714 2598 2715 2599 darray_for_each_reverse(*path, i) { 2716 2600 ret = nested_lockrestart_do(trans, 2717 - bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); 2601 + bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); 2718 2602 bch_err_fn(trans->c, ret); 2719 2603 if (ret) 2720 2604 break; ··· 2726 2608 return ret ?: trans_was_restarted(trans, restart_count); 2727 2609 } 2728 2610 2729 - static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) 2730 - { 2731 - darray_for_each(*p, i) 2732 - if (i->inum == inum && 2733 - i->snapshot == snapshot) 2734 - return true; 2735 - return false; 2736 - } 2737 - 2738 2611 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) 2739 2612 { 2740 2613 struct bch_fs *c = trans->c; 2741 2614 struct btree_iter inode_iter = {}; 2742 - pathbuf path = {}; 2615 + darray_u64 path = {}; 2743 2616 struct printbuf buf = PRINTBUF; 2744 2617 u32 snapshot = inode_k.k->p.snapshot; 2745 2618 bool redo_bi_depth = false; 2746 2619 u32 min_bi_depth = U32_MAX; 2747 2620 int ret = 0; 2748 2621 2622 + struct bpos start = inode_k.k->p; 2623 + 2749 2624 struct bch_inode_unpacked inode; 2750 2625 ret = bch2_inode_unpack(inode_k, &inode); 2751 2626 if (ret) 2752 2627 return ret; 2753 2628 2754 - while (!inode.bi_subvol) { 2629 + /* 2630 + * If we're running full fsck, check_dirents() will have already ran, 2631 + * and we shouldn't see any missing backpointers here - otherwise that's 2632 + * handled separately, by check_unreachable_inodes 2633 + */ 2634 + while (!inode.bi_subvol && 2635 + bch2_inode_has_backpointer(&inode)) { 2755 2636 struct 
btree_iter dirent_iter; 2756 2637 struct bkey_s_c_dirent d; 2757 - u32 parent_snapshot = snapshot; 2758 2638 2759 - d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); 2639 + d = dirent_get_by_pos(trans, &dirent_iter, 2640 + SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); 2760 2641 ret = bkey_err(d.s_c); 2761 2642 if (ret && !bch2_err_matches(ret, ENOENT)) 2762 2643 goto out; ··· 2773 2656 2774 2657 bch2_trans_iter_exit(trans, &dirent_iter); 2775 2658 2776 - ret = darray_push(&path, ((struct pathbuf_entry) { 2777 - .inum = inode.bi_inum, 2778 - .snapshot = snapshot, 2779 - })); 2659 + ret = darray_push(&path, inode.bi_inum); 2780 2660 if (ret) 2781 2661 return ret; 2782 - 2783 - snapshot = parent_snapshot; 2784 2662 2785 2663 bch2_trans_iter_exit(trans, &inode_iter); 2786 2664 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, ··· 2798 2686 break; 2799 2687 2800 2688 inode = parent_inode; 2801 - snapshot = inode_k.k->p.snapshot; 2802 2689 redo_bi_depth = true; 2803 2690 2804 - if (path_is_dup(&path, inode.bi_inum, snapshot)) { 2691 + if (darray_find(path, inode.bi_inum)) { 2805 2692 printbuf_reset(&buf); 2806 - prt_printf(&buf, "directory structure loop:\n"); 2807 - darray_for_each_reverse(path, i) 2808 - prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); 2809 - prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); 2693 + prt_printf(&buf, "directory structure loop in snapshot %u: ", 2694 + snapshot); 2695 + 2696 + ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); 2697 + if (ret) 2698 + goto out; 2699 + 2700 + if (c->opts.verbose) { 2701 + prt_newline(&buf); 2702 + darray_for_each(path, i) 2703 + prt_printf(&buf, "%llu ", *i); 2704 + } 2810 2705 2811 2706 if (fsck_err(trans, dir_loop, "%s", buf.buf)) { 2812 2707 ret = remove_backpointer(trans, &inode); ··· 2833 2714 min_bi_depth = 0; 2834 2715 2835 2716 if (redo_bi_depth) 2836 - ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); 2717 + ret 
= bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); 2837 2718 out: 2838 2719 fsck_err: 2839 2720 bch2_trans_iter_exit(trans, &inode_iter); ··· 2850 2731 int bch2_check_directory_structure(struct bch_fs *c) 2851 2732 { 2852 2733 int ret = bch2_trans_run(c, 2853 - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2734 + for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, 2854 2735 BTREE_ITER_intent| 2855 2736 BTREE_ITER_prefetch| 2856 2737 BTREE_ITER_all_snapshots, k,
+5
fs/bcachefs/inode.h
··· 254 254 : c->opts.casefold; 255 255 } 256 256 257 + static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) 258 + { 259 + return bi->bi_dir || bi->bi_dir_offset; 260 + } 261 + 257 262 /* i_nlink: */ 258 263 259 264 static inline unsigned nlink_bias(umode_t mode)
+6 -1
fs/bcachefs/io_read.c
··· 1491 1491 prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); 1492 1492 prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); 1493 1493 prt_printf(out, "context:\t%u\n", rbio->context); 1494 - prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); 1494 + 1495 + int ret = READ_ONCE(rbio->ret); 1496 + if (ret < 0) 1497 + prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); 1498 + else 1499 + prt_printf(out, "ret:\t%i\n", ret); 1495 1500 1496 1501 prt_printf(out, "flags:\t"); 1497 1502 bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
+7 -13
fs/bcachefs/journal.c
··· 1283 1283 ret = 0; /* wait and retry */ 1284 1284 1285 1285 bch2_disk_reservation_put(c, &disk_res); 1286 - closure_sync(&cl); 1286 + bch2_wait_on_allocator(c, &cl); 1287 1287 } 1288 1288 1289 1289 return ret; ··· 1474 1474 clear_bit(JOURNAL_running, &j->flags); 1475 1475 } 1476 1476 1477 - int bch2_fs_journal_start(struct journal *j, u64 cur_seq) 1477 + int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) 1478 1478 { 1479 1479 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1480 1480 struct journal_entry_pin_list *p; 1481 1481 struct journal_replay *i, **_i; 1482 1482 struct genradix_iter iter; 1483 1483 bool had_entries = false; 1484 - u64 last_seq = cur_seq, nr, seq; 1485 1484 1486 1485 /* 1487 1486 * ··· 1494 1495 return -EINVAL; 1495 1496 } 1496 1497 1497 - genradix_for_each_reverse(&c->journal_entries, iter, _i) { 1498 - i = *_i; 1498 + /* Clean filesystem? */ 1499 + if (!last_seq) 1500 + last_seq = cur_seq; 1499 1501 1500 - if (journal_replay_ignore(i)) 1501 - continue; 1502 - 1503 - last_seq = le64_to_cpu(i->j.last_seq); 1504 - break; 1505 - } 1506 - 1507 - nr = cur_seq - last_seq; 1502 + u64 nr = cur_seq - last_seq; 1508 1503 1509 1504 /* 1510 1505 * Extra fudge factor, in case we crashed when the journal pin fifo was ··· 1525 1532 j->pin.back = cur_seq; 1526 1533 atomic64_set(&j->seq, cur_seq - 1); 1527 1534 1535 + u64 seq; 1528 1536 fifo_for_each_entry_ptr(p, &j->pin, seq) 1529 1537 journal_pin_list_init(p, 1); 1530 1538
+1 -1
fs/bcachefs/journal.h
··· 453 453 void bch2_dev_journal_stop(struct journal *, struct bch_dev *); 454 454 455 455 void bch2_fs_journal_stop(struct journal *); 456 - int bch2_fs_journal_start(struct journal *, u64); 456 + int bch2_fs_journal_start(struct journal *, u64, u64); 457 457 void bch2_journal_set_replay_done(struct journal *); 458 458 459 459 void bch2_dev_journal_exit(struct bch_dev *);
+20 -6
fs/bcachefs/journal_io.c
··· 160 160 struct printbuf buf = PRINTBUF; 161 161 int ret = JOURNAL_ENTRY_ADD_OK; 162 162 163 + if (last_seq && c->opts.journal_rewind) 164 + last_seq = min(last_seq, c->opts.journal_rewind); 165 + 163 166 if (!c->journal.oldest_seq_found_ondisk || 164 167 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 165 168 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); ··· 1433 1430 printbuf_reset(&buf); 1434 1431 prt_printf(&buf, "journal read done, replaying entries %llu-%llu", 1435 1432 *last_seq, *blacklist_seq - 1); 1433 + 1434 + /* 1435 + * Drop blacklisted entries and entries older than last_seq (or start of 1436 + * journal rewind: 1437 + */ 1438 + u64 drop_before = *last_seq; 1439 + if (c->opts.journal_rewind) { 1440 + drop_before = min(drop_before, c->opts.journal_rewind); 1441 + prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); 1442 + } 1443 + 1444 + *last_seq = drop_before; 1436 1445 if (*start_seq != *blacklist_seq) 1437 1446 prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); 1438 1447 bch_info(c, "%s", buf.buf); 1439 - 1440 - /* Drop blacklisted entries and entries older than last_seq: */ 1441 1448 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1442 1449 i = *_i; 1443 1450 ··· 1455 1442 continue; 1456 1443 1457 1444 seq = le64_to_cpu(i->j.seq); 1458 - if (seq < *last_seq) { 1445 + if (seq < drop_before) { 1459 1446 journal_replay_free(c, i, false); 1460 1447 continue; 1461 1448 } ··· 1468 1455 } 1469 1456 } 1470 1457 1471 - ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1); 1458 + ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); 1472 1459 if (ret) 1473 1460 goto err; 1474 1461 ··· 1716 1703 bch2_log_msg_start(c, &buf); 1717 1704 1718 1705 if (err == -BCH_ERR_journal_write_err) 1719 - prt_printf(&buf, "unable to write journal to sufficient devices"); 1706 + prt_printf(&buf, "unable to write journal to sufficient devices\n"); 1720 1707 else 1721 
- prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); 1708 + prt_printf(&buf, "journal write error marking replicas: %s\n", 1709 + bch2_err_str(err)); 1722 1710 1723 1711 bch2_fs_emergency_read_only2(c, &buf); 1724 1712
+23 -7
fs/bcachefs/namei.c
··· 625 625 { 626 626 unsigned orig_pos = path->pos; 627 627 int ret = 0; 628 + DARRAY(subvol_inum) inums = {}; 629 + 630 + if (!snapshot) { 631 + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); 632 + if (ret) 633 + goto disconnected; 634 + } 628 635 629 636 while (true) { 630 - if (!snapshot) { 631 - ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); 632 - if (ret) 633 - goto disconnected; 637 + subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; 638 + 639 + if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { 640 + prt_str_reversed(path, "(loop)"); 641 + break; 634 642 } 643 + 644 + ret = darray_push(&inums, n); 645 + if (ret) 646 + goto err; 635 647 636 648 struct bch_inode_unpacked inode; 637 649 ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); ··· 662 650 inum = inode.bi_dir; 663 651 if (inode.bi_parent_subvol) { 664 652 subvol = inode.bi_parent_subvol; 665 - snapshot = 0; 653 + ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); 654 + if (ret) 655 + goto disconnected; 666 656 } 667 657 668 658 struct btree_iter d_iter; ··· 676 662 goto disconnected; 677 663 678 664 struct qstr dirent_name = bch2_dirent_get_name(d); 665 + 679 666 prt_bytes_reversed(path, dirent_name.name, dirent_name.len); 680 667 681 668 prt_char(path, '/'); ··· 692 677 goto err; 693 678 694 679 reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); 680 + darray_exit(&inums); 695 681 return 0; 696 682 err: 683 + darray_exit(&inums); 697 684 return ret; 698 685 disconnected: 699 686 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ··· 734 717 if (inode_points_to_dirent(target, d)) 735 718 return 0; 736 719 737 - if (!target->bi_dir && 738 - !target->bi_dir_offset) { 720 + if (!bch2_inode_has_backpointer(target)) { 739 721 fsck_err_on(S_ISDIR(target->bi_mode), 740 722 trans, inode_dir_missing_backpointer, 741 723 "directory with missing backpointer\n%s",
+5
fs/bcachefs/opts.h
··· 379 379 OPT_BOOL(), \ 380 380 BCH2_NO_SB_OPT, false, \ 381 381 NULL, "Exit recovery immediately prior to journal replay")\ 382 + x(journal_rewind, u64, \ 383 + OPT_FS|OPT_MOUNT, \ 384 + OPT_UINT(0, U64_MAX), \ 385 + BCH2_NO_SB_OPT, 0, \ 386 + NULL, "Rewind journal") \ 382 387 x(recovery_passes, u64, \ 383 388 OPT_FS|OPT_MOUNT, \ 384 389 OPT_BITFIELD(bch2_recovery_passes), \
+20 -4
fs/bcachefs/recovery.c
··· 607 607 buf.buf, bch2_err_str(ret))) { 608 608 if (btree_id_is_alloc(i)) 609 609 r->error = 0; 610 + ret = 0; 610 611 } 611 612 } 612 613 ··· 693 692 ret = true; 694 693 } 695 694 696 - if (new_version > c->sb.version_incompat && 695 + if (new_version > c->sb.version_incompat_allowed && 697 696 c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { 698 697 struct printbuf buf = PRINTBUF; 699 698 ··· 757 756 758 757 if (c->opts.nochanges) 759 758 c->opts.read_only = true; 759 + 760 + if (c->opts.journal_rewind) { 761 + bch_info(c, "rewinding journal, fsck required"); 762 + c->opts.fsck = true; 763 + } 764 + 765 + if (go_rw_in_recovery(c)) { 766 + /* 767 + * start workqueues/kworkers early - kthread creation checks for 768 + * pending signals, which is _very_ annoying 769 + */ 770 + ret = bch2_fs_init_rw(c); 771 + if (ret) 772 + goto err; 773 + } 760 774 761 775 mutex_lock(&c->sb_lock); 762 776 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); ··· 981 965 982 966 ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", 983 967 journal_seq, last_seq, blacklist_seq - 1) ?: 984 - bch2_fs_journal_start(&c->journal, journal_seq); 968 + bch2_fs_journal_start(&c->journal, last_seq, journal_seq); 985 969 if (ret) 986 970 goto err; 987 971 ··· 1142 1126 struct printbuf buf = PRINTBUF; 1143 1127 bch2_log_msg_start(c, &buf); 1144 1128 1145 - prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); 1129 + prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); 1146 1130 bch2_fs_emergency_read_only2(c, &buf); 1147 1131 1148 1132 bch2_print_str(c, KERN_ERR, buf.buf); ··· 1197 1181 * journal_res_get() will crash if called before this has 1198 1182 * set up the journal.pin FIFO and journal.cur pointer: 1199 1183 */ 1200 - ret = bch2_fs_journal_start(&c->journal, 1); 1184 + ret = bch2_fs_journal_start(&c->journal, 1, 1); 1201 1185 if (ret) 1202 1186 goto err; 1203 1187
+9 -10
fs/bcachefs/recovery_passes.c
··· 217 217 218 218 set_bit(BCH_FS_may_go_rw, &c->flags); 219 219 220 - if (keys->nr || 221 - !c->opts.read_only || 222 - !c->sb.clean || 223 - c->opts.recovery_passes || 224 - (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { 220 + if (go_rw_in_recovery(c)) { 225 221 if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { 226 222 bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); 227 223 bch2_reconstruct_alloc(c); ··· 313 317 */ 314 318 bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); 315 319 bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); 320 + bool rewind = in_recovery && 321 + r->curr_pass > pass && 322 + !(r->passes_complete & BIT_ULL(pass)); 316 323 317 324 if (persistent 318 325 ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) ··· 324 325 325 326 if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && 326 327 (r->passes_ratelimiting & BIT_ULL(pass))) 328 + return true; 329 + 330 + if (rewind) 327 331 return true; 328 332 329 333 return false; ··· 342 340 { 343 341 struct bch_fs_recovery *r = &c->recovery; 344 342 int ret = 0; 345 - 346 343 347 344 lockdep_assert_held(&c->sb_lock); 348 345 ··· 413 412 { 414 413 int ret = 0; 415 414 416 - scoped_guard(mutex, &c->sb_lock) { 417 - if (!recovery_pass_needs_set(c, pass, &flags)) 418 - return 0; 419 - 415 + if (recovery_pass_needs_set(c, pass, &flags)) { 416 + guard(mutex)(&c->sb_lock); 420 417 ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); 421 418 bch2_write_super(c); 422 419 }
+9
fs/bcachefs/recovery_passes.h
··· 17 17 RUN_RECOVERY_PASS_ratelimit = BIT(1), 18 18 }; 19 19 20 + static inline bool go_rw_in_recovery(struct bch_fs *c) 21 + { 22 + return (c->journal_keys.nr || 23 + !c->opts.read_only || 24 + !c->sb.clean || 25 + c->opts.recovery_passes || 26 + (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); 27 + } 28 + 20 29 int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); 21 30 22 31 int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+7 -5
fs/bcachefs/reflink.c
··· 64 64 REFLINK_P_IDX(p.v), 65 65 le32_to_cpu(p.v->front_pad), 66 66 le32_to_cpu(p.v->back_pad)); 67 + 68 + if (REFLINK_P_ERROR(p.v)) 69 + prt_str(out, " error"); 67 70 } 68 71 69 72 bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ··· 272 269 return k; 273 270 274 271 if (unlikely(!bkey_extent_is_reflink_data(k.k))) { 275 - unsigned size = min((u64) k.k->size, 276 - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - 277 - reflink_offset); 278 - bch2_key_resize(&iter->k, size); 272 + u64 missing_end = min(k.k->p.offset, 273 + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); 274 + BUG_ON(reflink_offset == missing_end); 279 275 280 276 int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, 281 - k.k->p.offset, should_commit); 277 + missing_end, should_commit); 282 278 if (ret) { 283 279 bch2_trans_iter_exit(trans, iter); 284 280 return bkey_s_c_err(ret);
+10 -9
fs/bcachefs/sb-errors_format.h
··· 3 3 #define _BCACHEFS_SB_ERRORS_FORMAT_H 4 4 5 5 enum bch_fsck_flags { 6 - FSCK_CAN_FIX = 1 << 0, 7 - FSCK_CAN_IGNORE = 1 << 1, 8 - FSCK_AUTOFIX = 1 << 2, 6 + FSCK_CAN_FIX = BIT(0), 7 + FSCK_CAN_IGNORE = BIT(1), 8 + FSCK_AUTOFIX = BIT(2), 9 + FSCK_ERR_NO_LOG = BIT(3), 9 10 }; 10 11 11 12 #define BCH_SB_ERRS() \ ··· 218 217 x(inode_str_hash_invalid, 194, 0) \ 219 218 x(inode_v3_fields_start_bad, 195, 0) \ 220 219 x(inode_snapshot_mismatch, 196, 0) \ 221 - x(snapshot_key_missing_inode_snapshot, 314, 0) \ 220 + x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ 222 221 x(inode_unlinked_but_clean, 197, 0) \ 223 222 x(inode_unlinked_but_nlink_nonzero, 198, 0) \ 224 223 x(inode_unlinked_and_not_open, 281, 0) \ ··· 252 251 x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ 253 252 x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ 254 253 x(extent_overlapping, 215, 0) \ 255 - x(key_in_missing_inode, 216, 0) \ 254 + x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ 256 255 x(key_in_wrong_inode_type, 217, 0) \ 257 - x(extent_past_end_of_inode, 218, 0) \ 256 + x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ 258 257 x(dirent_empty_name, 219, 0) \ 259 258 x(dirent_val_too_big, 220, 0) \ 260 259 x(dirent_name_too_long, 221, 0) \ 261 260 x(dirent_name_embedded_nul, 222, 0) \ 262 261 x(dirent_name_dot_or_dotdot, 223, 0) \ 263 262 x(dirent_name_has_slash, 224, 0) \ 264 - x(dirent_d_type_wrong, 225, 0) \ 263 + x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ 265 264 x(inode_bi_parent_wrong, 226, 0) \ 266 265 x(dirent_in_missing_dir_inode, 227, 0) \ 267 266 x(dirent_in_non_dir_inode, 228, 0) \ 268 - x(dirent_to_missing_inode, 229, 0) \ 267 + x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ 269 268 x(dirent_to_overwritten_inode, 302, 0) \ 270 269 x(dirent_to_missing_subvol, 230, 0) \ 271 270 x(dirent_to_itself, 231, 0) \ ··· 301 300 x(btree_node_bkey_bad_u64s, 260, 0) \ 302 301 x(btree_node_topology_empty_interior_node, 261, 0) \ 303 302 x(btree_ptr_v2_min_key_bad, 262, 0) 
\ 304 - x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ 303 + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ 305 304 x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ 306 305 x(dup_backpointer_to_bad_csum_extent, 265, 0) \ 307 306 x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
+9 -5
fs/bcachefs/snapshot.c
··· 135 135 136 136 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) 137 137 { 138 - bool ret; 138 + #ifdef CONFIG_BCACHEFS_DEBUG 139 + u32 orig_id = id; 140 + #endif 139 141 140 142 guard(rcu)(); 141 143 struct snapshot_table *t = rcu_dereference(c->snapshots); ··· 149 147 while (id && id < ancestor - IS_ANCESTOR_BITMAP) 150 148 id = get_ancestor_below(t, id, ancestor); 151 149 152 - ret = id && id < ancestor 150 + bool ret = id && id < ancestor 153 151 ? test_ancestor_bitmap(t, id, ancestor) 154 152 : id == ancestor; 155 153 156 - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); 154 + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); 157 155 return ret; 158 156 } 159 157 ··· 871 869 872 870 for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, 873 871 0, k, ret) { 874 - if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { 872 + if (k.k->type == KEY_TYPE_snapshot_tree && 873 + le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { 875 874 tree_id = k.k->p.offset; 876 875 break; 877 876 } ··· 900 897 901 898 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 902 899 0, k, ret) { 903 - if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { 900 + if (k.k->type == KEY_TYPE_subvolume && 901 + le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { 904 902 snapshot->v.subvol = cpu_to_le32(k.k->p.offset); 905 903 SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); 906 904 break;
+11 -2
fs/bcachefs/super.c
··· 210 210 static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); 211 211 static void bch2_dev_io_ref_stop(struct bch_dev *, int); 212 212 static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); 213 - static int bch2_fs_init_rw(struct bch_fs *); 214 213 215 214 struct bch_fs *bch2_dev_to_fs(dev_t dev) 216 215 { ··· 793 794 return ret; 794 795 } 795 796 796 - static int bch2_fs_init_rw(struct bch_fs *c) 797 + int bch2_fs_init_rw(struct bch_fs *c) 797 798 { 798 799 if (test_bit(BCH_FS_rw_init_done, &c->flags)) 799 800 return 0; ··· 1013 1014 bch2_fs_vfs_init(c); 1014 1015 if (ret) 1015 1016 goto err; 1017 + 1018 + if (go_rw_in_recovery(c)) { 1019 + /* 1020 + * start workqueues/kworkers early - kthread creation checks for 1021 + * pending signals, which is _very_ annoying 1022 + */ 1023 + ret = bch2_fs_init_rw(c); 1024 + if (ret) 1025 + goto err; 1026 + } 1016 1027 1017 1028 #ifdef CONFIG_UNICODE 1018 1029 /* Default encoding until we can potentially have more as an option. */
+1
fs/bcachefs/super.h
··· 46 46 void bch2_fs_free(struct bch_fs *); 47 47 void bch2_fs_stop(struct bch_fs *); 48 48 49 + int bch2_fs_init_rw(struct bch_fs *); 49 50 int bch2_fs_start(struct bch_fs *); 50 51 struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); 51 52
+28 -97
fs/bcachefs/trace.h
··· 1080 1080 __entry->must_wait) 1081 1081 ); 1082 1082 1083 - TRACE_EVENT(trans_restart_journal_preres_get, 1084 - TP_PROTO(struct btree_trans *trans, 1085 - unsigned long caller_ip, 1086 - unsigned flags), 1087 - TP_ARGS(trans, caller_ip, flags), 1088 - 1089 - TP_STRUCT__entry( 1090 - __array(char, trans_fn, 32 ) 1091 - __field(unsigned long, caller_ip ) 1092 - __field(unsigned, flags ) 1093 - ), 1094 - 1095 - TP_fast_assign( 1096 - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); 1097 - __entry->caller_ip = caller_ip; 1098 - __entry->flags = flags; 1099 - ), 1100 - 1101 - TP_printk("%s %pS %x", __entry->trans_fn, 1102 - (void *) __entry->caller_ip, 1103 - __entry->flags) 1104 - ); 1105 - 1083 + #if 0 1084 + /* todo: bring back dynamic fault injection */ 1106 1085 DEFINE_EVENT(transaction_event, trans_restart_fault_inject, 1107 1086 TP_PROTO(struct btree_trans *trans, 1108 1087 unsigned long caller_ip), 1109 1088 TP_ARGS(trans, caller_ip) 1110 1089 ); 1090 + #endif 1111 1091 1112 1092 DEFINE_EVENT(transaction_event, trans_traverse_all, 1113 1093 TP_PROTO(struct btree_trans *trans, ··· 1175 1195 TP_ARGS(trans, caller_ip, path) 1176 1196 ); 1177 1197 1178 - DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, 1179 - TP_PROTO(struct btree_trans *trans, 1180 - unsigned long caller_ip, 1181 - struct btree_path *path), 1182 - TP_ARGS(trans, caller_ip, path) 1183 - ); 1184 - 1185 - DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, 1186 - TP_PROTO(struct btree_trans *trans, 1187 - unsigned long caller_ip), 1188 - TP_ARGS(trans, caller_ip) 1189 - ); 1190 - 1191 1198 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, 1192 1199 TP_PROTO(struct btree_trans *trans, 1193 1200 unsigned long caller_ip, ··· 1190 1223 ); 1191 1224 1192 1225 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, 1193 - TP_PROTO(struct btree_trans *trans, 1194 - unsigned long caller_ip, 1195 - struct 
btree_path *path), 1196 - TP_ARGS(trans, caller_ip, path) 1197 - ); 1198 - 1199 - DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, 1200 1226 TP_PROTO(struct btree_trans *trans, 1201 1227 unsigned long caller_ip, 1202 1228 struct btree_path *path), ··· 1252 1292 __entry->trans_fn, 1253 1293 (void *) __entry->caller_ip, 1254 1294 __entry->bytes) 1255 - ); 1256 - 1257 - TRACE_EVENT(trans_restart_key_cache_key_realloced, 1258 - TP_PROTO(struct btree_trans *trans, 1259 - unsigned long caller_ip, 1260 - struct btree_path *path, 1261 - unsigned old_u64s, 1262 - unsigned new_u64s), 1263 - TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), 1264 - 1265 - TP_STRUCT__entry( 1266 - __array(char, trans_fn, 32 ) 1267 - __field(unsigned long, caller_ip ) 1268 - __field(enum btree_id, btree_id ) 1269 - TRACE_BPOS_entries(pos) 1270 - __field(u32, old_u64s ) 1271 - __field(u32, new_u64s ) 1272 - ), 1273 - 1274 - TP_fast_assign( 1275 - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); 1276 - __entry->caller_ip = caller_ip; 1277 - 1278 - __entry->btree_id = path->btree_id; 1279 - TRACE_BPOS_assign(pos, path->pos); 1280 - __entry->old_u64s = old_u64s; 1281 - __entry->new_u64s = new_u64s; 1282 - ), 1283 - 1284 - TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", 1285 - __entry->trans_fn, 1286 - (void *) __entry->caller_ip, 1287 - bch2_btree_id_str(__entry->btree_id), 1288 - __entry->pos_inode, 1289 - __entry->pos_offset, 1290 - __entry->pos_snapshot, 1291 - __entry->old_u64s, 1292 - __entry->new_u64s) 1293 1295 ); 1294 1296 1295 1297 DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, ··· 1408 1486 ); 1409 1487 1410 1488 DEFINE_EVENT(fs_str, io_move_evacuate_bucket, 1489 + TP_PROTO(struct bch_fs *c, const char *str), 1490 + TP_ARGS(c, str) 1491 + ); 1492 + 1493 + DEFINE_EVENT(fs_str, extent_trim_atomic, 1494 + TP_PROTO(struct bch_fs *c, const char *str), 1495 + TP_ARGS(c, str) 1496 + ); 1497 + 1498 + DEFINE_EVENT(fs_str, 
btree_iter_peek_slot, 1499 + TP_PROTO(struct bch_fs *c, const char *str), 1500 + TP_ARGS(c, str) 1501 + ); 1502 + 1503 + DEFINE_EVENT(fs_str, __btree_iter_peek, 1504 + TP_PROTO(struct bch_fs *c, const char *str), 1505 + TP_ARGS(c, str) 1506 + ); 1507 + 1508 + DEFINE_EVENT(fs_str, btree_iter_peek_max, 1509 + TP_PROTO(struct bch_fs *c, const char *str), 1510 + TP_ARGS(c, str) 1511 + ); 1512 + 1513 + DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, 1411 1514 TP_PROTO(struct bch_fs *c, const char *str), 1412 1515 TP_ARGS(c, str) 1413 1516 ); ··· 1849 1902 __entry->dup_locked) 1850 1903 ); 1851 1904 1852 - TRACE_EVENT(btree_path_free_trans_begin, 1853 - TP_PROTO(btree_path_idx_t path), 1854 - TP_ARGS(path), 1855 - 1856 - TP_STRUCT__entry( 1857 - __field(btree_path_idx_t, idx ) 1858 - ), 1859 - 1860 - TP_fast_assign( 1861 - __entry->idx = path; 1862 - ), 1863 - 1864 - TP_printk(" path %3u", __entry->idx) 1865 - ); 1866 - 1867 1905 #else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ 1868 1906 #ifndef _TRACE_BCACHEFS_H 1869 1907 ··· 1866 1934 static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} 1867 1935 static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} 1868 1936 static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} 1869 - static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} 1870 1937 1871 1938 #endif 1872 1939 #endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */