Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: fix btrfs fallocate oops and deadlock
Btrfs: use the right node in reada_for_balance
Btrfs: fix oops on page->mapping->host during writepage
Btrfs: add a priority queue to the async thread helpers
Btrfs: use WRITE_SYNC for synchronous writes

+273 -84
+47 -13
fs/btrfs/async-thread.c
··· 25 25 #define WORK_QUEUED_BIT 0 26 26 #define WORK_DONE_BIT 1 27 27 #define WORK_ORDER_DONE_BIT 2 28 + #define WORK_HIGH_PRIO_BIT 3 28 29 29 30 /* 30 31 * container for the kthread task pointer and the list of pending work ··· 37 36 38 37 /* list of struct btrfs_work that are waiting for service */ 39 38 struct list_head pending; 39 + struct list_head prio_pending; 40 40 41 41 /* list of worker threads from struct btrfs_workers */ 42 42 struct list_head worker_list; ··· 105 103 106 104 spin_lock_irqsave(&workers->lock, flags); 107 105 108 - while (!list_empty(&workers->order_list)) { 109 - work = list_entry(workers->order_list.next, 110 - struct btrfs_work, order_list); 111 - 106 + while (1) { 107 + if (!list_empty(&workers->prio_order_list)) { 108 + work = list_entry(workers->prio_order_list.next, 109 + struct btrfs_work, order_list); 110 + } else if (!list_empty(&workers->order_list)) { 111 + work = list_entry(workers->order_list.next, 112 + struct btrfs_work, order_list); 113 + } else { 114 + break; 115 + } 112 116 if (!test_bit(WORK_DONE_BIT, &work->flags)) 113 117 break; 114 118 ··· 151 143 do { 152 144 spin_lock_irq(&worker->lock); 153 145 again_locked: 154 - while (!list_empty(&worker->pending)) { 155 - cur = worker->pending.next; 146 + while (1) { 147 + if (!list_empty(&worker->prio_pending)) 148 + cur = worker->prio_pending.next; 149 + else if (!list_empty(&worker->pending)) 150 + cur = worker->pending.next; 151 + else 152 + break; 153 + 156 154 work = list_entry(cur, struct btrfs_work, list); 157 155 list_del(&work->list); 158 156 clear_bit(WORK_QUEUED_BIT, &work->flags); ··· 177 163 178 164 spin_lock_irq(&worker->lock); 179 165 check_idle_worker(worker); 180 - 181 166 } 182 167 if (freezing(current)) { 183 168 worker->working = 0; ··· 191 178 * jump_in? 192 179 */ 193 180 smp_mb(); 194 - if (!list_empty(&worker->pending)) 181 + if (!list_empty(&worker->pending) || 182 + !list_empty(&worker->prio_pending)) 195 183 continue; 196 184 197 185 /* ··· 205 191 */ 206 192 schedule_timeout(1); 207 193 smp_mb(); 208 - if (!list_empty(&worker->pending)) 194 + if (!list_empty(&worker->pending) || 195 + !list_empty(&worker->prio_pending)) 209 196 continue; 210 197 211 198 if (kthread_should_stop()) ··· 215 200 /* still no more work?, sleep for real */ 216 201 spin_lock_irq(&worker->lock); 217 202 set_current_state(TASK_INTERRUPTIBLE); 218 - if (!list_empty(&worker->pending)) 203 + if (!list_empty(&worker->pending) || 204 + !list_empty(&worker->prio_pending)) 219 205 goto again_locked; 220 206 221 207 /* ··· 264 248 INIT_LIST_HEAD(&workers->worker_list); 265 249 INIT_LIST_HEAD(&workers->idle_list); 266 250 INIT_LIST_HEAD(&workers->order_list); 251 + INIT_LIST_HEAD(&workers->prio_order_list); 267 252 spin_lock_init(&workers->lock); 268 253 workers->max_workers = max; 269 254 workers->idle_thresh = 32; ··· 290 273 } 291 274 292 275 INIT_LIST_HEAD(&worker->pending); 276 + INIT_LIST_HEAD(&worker->prio_pending); 293 277 INIT_LIST_HEAD(&worker->worker_list); 294 278 spin_lock_init(&worker->lock); 295 279 atomic_set(&worker->num_pending, 0); ··· 414 396 goto out; 415 397 416 398 spin_lock_irqsave(&worker->lock, flags); 417 - list_add_tail(&work->list, &worker->pending); 399 + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 400 + list_add_tail(&work->list, &worker->prio_pending); 401 + else 402 + list_add_tail(&work->list, &worker->pending); 418 403 atomic_inc(&worker->num_pending); 419 404 420 405 /* by definition we're busy, take ourselves off the idle ··· 443 422 return 0; 444 423 } 445 424 425 + void btrfs_set_work_high_prio(struct btrfs_work *work) 426 + { 427 + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 428 + } 429 + 446 430 /* 447 431 * places a struct btrfs_work into the pending queue of one of the kthreads 448 432 */ ··· 464 438 worker = find_worker(workers); 465 439 if (workers->ordered) { 466 440 spin_lock_irqsave(&workers->lock, flags); 467 - list_add_tail(&work->order_list, &workers->order_list); 441 + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { 442 + list_add_tail(&work->order_list, 443 + &workers->prio_order_list); 444 + } else { 445 + list_add_tail(&work->order_list, &workers->order_list); 446 + } 468 447 spin_unlock_irqrestore(&workers->lock, flags); 469 448 } else { 470 449 INIT_LIST_HEAD(&work->order_list); ··· 477 446 478 447 spin_lock_irqsave(&worker->lock, flags); 479 448 480 - list_add_tail(&work->list, &worker->pending); 449 + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 450 + list_add_tail(&work->list, &worker->prio_pending); 451 + else 452 + list_add_tail(&work->list, &worker->pending); 481 453 atomic_inc(&worker->num_pending); 482 454 check_busy_worker(worker); 483 455
+2
fs/btrfs/async-thread.h
··· 85 85 * of work items waiting for completion 86 86 */ 87 87 struct list_head order_list; 88 + struct list_head prio_order_list; 88 89 89 90 /* lock for finding the next worker thread to queue on */ 90 91 spinlock_t lock; ··· 99 98 int btrfs_stop_workers(struct btrfs_workers *workers); 100 99 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); 101 100 int btrfs_requeue_work(struct btrfs_work *work); 101 + void btrfs_set_work_high_prio(struct btrfs_work *work); 102 102 #endif
+12 -5
fs/btrfs/ctree.c
··· 1325 1325 int ret = 0; 1326 1326 int blocksize; 1327 1327 1328 - parent = path->nodes[level - 1]; 1328 + parent = path->nodes[level + 1]; 1329 1329 if (!parent) 1330 1330 return 0; 1331 1331 1332 1332 nritems = btrfs_header_nritems(parent); 1333 - slot = path->slots[level]; 1333 + slot = path->slots[level + 1]; 1334 1334 blocksize = btrfs_level_size(root, level); 1335 1335 1336 1336 if (slot > 0) { ··· 1341 1341 block1 = 0; 1342 1342 free_extent_buffer(eb); 1343 1343 } 1344 - if (slot < nritems) { 1344 + if (slot + 1 < nritems) { 1345 1345 block2 = btrfs_node_blockptr(parent, slot + 1); 1346 1346 gen = btrfs_node_ptr_generation(parent, slot + 1); 1347 1347 eb = btrfs_find_tree_block(root, block2, blocksize); ··· 1351 1351 } 1352 1352 if (block1 || block2) { 1353 1353 ret = -EAGAIN; 1354 + 1355 + /* release the whole path */ 1354 1356 btrfs_release_path(root, path); 1357 + 1358 + /* read the blocks */ 1355 1359 if (block1) 1356 1360 readahead_tree_block(root, block1, blocksize, 0); 1357 1361 if (block2) ··· 1365 1361 eb = read_tree_block(root, block1, blocksize, 0); 1366 1362 free_extent_buffer(eb); 1367 1363 } 1368 - if (block1) { 1364 + if (block2) { 1369 1365 eb = read_tree_block(root, block2, blocksize, 0); 1370 1366 free_extent_buffer(eb); 1371 1367 } ··· 1485 1481 * of the btree by dropping locks before 1486 1482 * we read. 1487 1483 */ 1488 - btrfs_release_path(NULL, p); 1484 + btrfs_unlock_up_safe(p, level + 1); 1485 + btrfs_set_path_blocking(p); 1486 + 1489 1487 if (tmp) 1490 1488 free_extent_buffer(tmp); 1491 1489 if (p->reada) 1492 1490 reada_for_search(root, p, level, slot, key->objectid); 1493 1491 1492 + btrfs_release_path(NULL, p); 1494 1493 tmp = read_tree_block(root, blocknr, blocksize, gen); 1495 1494 if (tmp) 1496 1495 free_extent_buffer(tmp);
+7 -2
fs/btrfs/disk-io.c
··· 579 579 async->bio_flags = bio_flags; 580 580 581 581 atomic_inc(&fs_info->nr_async_submits); 582 + 583 + if (rw & (1 << BIO_RW_SYNCIO)) 584 + btrfs_set_work_high_prio(&async->work); 585 + 582 586 btrfs_queue_worker(&fs_info->workers, &async->work); 583 587 #if 0 584 588 int limit = btrfs_async_submit_limit(fs_info); ··· 660 656 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 661 657 mirror_num, 0); 662 658 } 659 + 663 660 /* 664 661 * kthread helpers are used to submit writes so that checksumming 665 662 * can happen in parallel across all CPUs ··· 2100 2095 device->barriers = 0; 2101 2096 get_bh(bh); 2102 2097 lock_buffer(bh); 2103 - ret = submit_bh(WRITE, bh); 2098 + ret = submit_bh(WRITE_SYNC, bh); 2104 2099 } 2105 2100 } else { 2106 - ret = submit_bh(WRITE, bh); 2101 + ret = submit_bh(WRITE_SYNC, bh); 2107 2102 } 2108 2103 2109 2104 if (!ret && wait) {
+64 -24
fs/btrfs/extent_io.c
··· 50 50 /* tells writepage not to lock the state bits for this range 51 51 * it still does the unlocking 52 52 */ 53 - int extent_locked; 53 + unsigned int extent_locked:1; 54 + 55 + /* tells the submit_bio code to use a WRITE_SYNC */ 56 + unsigned int sync_io:1; 54 57 }; 55 58 56 59 int __init extent_io_init(void) ··· 2104 2101 return ret; 2105 2102 } 2106 2103 2104 + static noinline void update_nr_written(struct page *page, 2105 + struct writeback_control *wbc, 2106 + unsigned long nr_written) 2107 + { 2108 + wbc->nr_to_write -= nr_written; 2109 + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2110 + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2111 + page->mapping->writeback_index = page->index + nr_written; 2112 + } 2113 + 2107 2114 /* 2108 2115 * the writepage semantics are similar to regular writepage. extent 2109 2116 * records are inserted to lock ranges in the tree, and as dirty areas ··· 2149 2136 u64 delalloc_end; 2150 2137 int page_started; 2151 2138 int compressed; 2139 + int write_flags; 2152 2140 unsigned long nr_written = 0; 2141 + 2142 + if (wbc->sync_mode == WB_SYNC_ALL) 2143 + write_flags = WRITE_SYNC_PLUG; 2144 + else 2145 + write_flags = WRITE; 2153 2146 2154 2147 WARN_ON(!PageLocked(page)); 2155 2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); ··· 2183 2164 delalloc_end = 0; 2184 2165 page_started = 0; 2185 2166 if (!epd->extent_locked) { 2167 + /* 2168 + * make sure the wbc mapping index is at least updated 2169 + * to this page. 2170 + */ 2171 + update_nr_written(page, wbc, 0); 2172 + 2186 2173 while (delalloc_end < page_end) { 2187 2174 nr_delalloc = find_lock_delalloc_range(inode, tree, 2188 2175 page, ··· 2210 2185 */ 2211 2186 if (page_started) { 2212 2187 ret = 0; 2213 - goto update_nr_written; 2188 + /* 2189 + * we've unlocked the page, so we can't update 2190 + * the mapping's writeback index, just update 2191 + * nr_to_write. 2192 + */ 2193 + wbc->nr_to_write -= nr_written; 2194 + goto done_unlocked; 2214 2195 } 2215 2196 } 2216 2197 lock_extent(tree, start, page_end, GFP_NOFS); ··· 2229 2198 if (ret == -EAGAIN) { 2230 2199 unlock_extent(tree, start, page_end, GFP_NOFS); 2231 2200 redirty_page_for_writepage(wbc, page); 2201 + update_nr_written(page, wbc, nr_written); 2232 2202 unlock_page(page); 2233 2203 ret = 0; 2234 - goto update_nr_written; 2204 + goto done_unlocked; 2235 2205 } 2236 2206 } 2237 2207 2238 - nr_written++; 2208 + /* 2209 + * we don't want to touch the inode after unlocking the page, 2210 + * so we update the mapping writeback index now 2211 + */ 2212 + update_nr_written(page, wbc, nr_written + 1); 2239 2213 2240 2214 end = page_end; 2241 2215 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) ··· 2350 2314 (unsigned long long)end); 2351 2315 } 2352 2316 2353 - ret = submit_extent_page(WRITE, tree, page, sector, 2354 - iosize, pg_offset, bdev, 2355 - &epd->bio, max_nr, 2317 + ret = submit_extent_page(write_flags, tree, page, 2318 + sector, iosize, pg_offset, 2319 + bdev, &epd->bio, max_nr, 2356 2320 end_bio_extent_writepage, 2357 2321 0, 0, 0); 2358 2322 if (ret) ··· 2372 2336 unlock_extent(tree, unlock_start, page_end, GFP_NOFS); 2373 2337 unlock_page(page); 2374 2338 2375 - update_nr_written: 2376 - wbc->nr_to_write -= nr_written; 2377 - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2378 - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2379 - page->mapping->writeback_index = page->index + nr_written; 2339 + done_unlocked: 2340 + 2380 2341 return 0; 2381 2342 } 2382 2343 ··· 2493 2460 return ret; 2494 2461 } 2495 2462 2463 + static void flush_epd_write_bio(struct extent_page_data *epd) 2464 + { 2465 + if (epd->bio) { 2466 + if (epd->sync_io) 2467 + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2468 + else 2469 + submit_one_bio(WRITE, epd->bio, 0, 0); 2470 + epd->bio = NULL; 2471 + } 2472 + } 2473 + 2496 2474 static noinline void flush_write_bio(void *data) 2497 2475 { 2498 2476 struct extent_page_data *epd = data; 2499 - if (epd->bio) { 2500 - submit_one_bio(WRITE, epd->bio, 0, 0); 2501 - epd->bio = NULL; 2502 - } 2477 + flush_epd_write_bio(epd); 2503 2478 } 2504 2479 2505 2480 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, ··· 2521 2480 .tree = tree, 2522 2481 .get_extent = get_extent, 2523 2482 .extent_locked = 0, 2483 + .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2524 2484 }; 2525 2485 struct writeback_control wbc_writepages = { 2526 2486 .bdi = wbc->bdi, 2527 - .sync_mode = WB_SYNC_NONE, 2487 + .sync_mode = wbc->sync_mode, 2528 2488 .older_than_this = NULL, 2529 2489 .nr_to_write = 64, 2530 2490 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2531 2491 .range_end = (loff_t)-1, 2532 2492 }; 2533 2493 2534 - 2535 2494 ret = __extent_writepage(page, wbc, &epd); 2536 2495 2537 2496 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2538 2497 __extent_writepage, &epd, flush_write_bio); 2539 - if (epd.bio) 2540 - submit_one_bio(WRITE, epd.bio, 0, 0); 2498 + flush_epd_write_bio(&epd); 2541 2499 return ret; 2542 2500 } 2543 2501 ··· 2555 2515 .tree = tree, 2556 2516 .get_extent = get_extent, 2557 2517 .extent_locked = 1, 2518 + .sync_io = mode == WB_SYNC_ALL, 2558 2519 }; 2559 2520 struct writeback_control wbc_writepages = { 2560 2521 .bdi = inode->i_mapping->backing_dev_info, ··· 2581 2540 start += PAGE_CACHE_SIZE; 2582 2541 } 2583 2542 2584 - if (epd.bio) 2585 - submit_one_bio(WRITE, epd.bio, 0, 0); 2543 + flush_epd_write_bio(&epd); 2586 2544 return ret; 2587 2545 } 2588 2546 ··· 2596 2556 .tree = tree, 2597 2557 .get_extent = get_extent, 2598 2558 .extent_locked = 0, 2559 + .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2599 2560 }; 2600 2561 2601 2562 ret = extent_write_cache_pages(tree, mapping, wbc, 2602 2563 __extent_writepage, &epd, 2603 2564 flush_write_bio); 2604 - if (epd.bio) 2605 - submit_one_bio(WRITE, epd.bio, 0, 0); 2565 + flush_epd_write_bio(&epd); 2606 2566 return ret; 2607 2567 } 2608 2568
+4 -2
fs/btrfs/file.c
··· 830 830 831 831 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 832 832 BUG_ON(ret); 833 - goto done; 833 + goto release; 834 834 } else if (split == start) { 835 835 if (locked_end < extent_end) { 836 836 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, ··· 926 926 } 927 927 done: 928 928 btrfs_mark_buffer_dirty(leaf); 929 + 930 + release: 929 931 btrfs_release_path(root, path); 930 932 if (split_end && split == start) { 931 933 split = end; ··· 1133 1131 if (will_write) { 1134 1132 btrfs_fdatawrite_range(inode->i_mapping, pos, 1135 1133 pos + write_bytes - 1, 1136 - WB_SYNC_NONE); 1134 + WB_SYNC_ALL); 1137 1135 } else { 1138 1136 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1139 1137 num_pages);
+28 -8
fs/btrfs/inode.c
··· 4970 4970 return err; 4971 4971 } 4972 4972 4973 - static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 4973 + static int prealloc_file_range(struct btrfs_trans_handle *trans, 4974 + struct inode *inode, u64 start, u64 end, 4974 4975 u64 alloc_hint, int mode) 4975 4976 { 4976 - struct btrfs_trans_handle *trans; 4977 4977 struct btrfs_root *root = BTRFS_I(inode)->root; 4978 4978 struct btrfs_key ins; 4979 4979 u64 alloc_size; 4980 4980 u64 cur_offset = start; 4981 4981 u64 num_bytes = end - start; 4982 4982 int ret = 0; 4983 - 4984 - trans = btrfs_join_transaction(root, 1); 4985 - BUG_ON(!trans); 4986 - btrfs_set_trans_block_group(trans, inode); 4987 4983 4988 4984 while (num_bytes > 0) { 4989 4985 alloc_size = min(num_bytes, root->fs_info->max_extent); ··· 5011 5015 BUG_ON(ret); 5012 5016 } 5013 5017 5014 - btrfs_end_transaction(trans, root); 5015 5018 return ret; 5016 5019 } 5017 5020 ··· 5024 5029 u64 alloc_hint = 0; 5025 5030 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5026 5031 struct extent_map *em; 5032 + struct btrfs_trans_handle *trans; 5027 5033 int ret; 5028 5034 5029 5035 alloc_start = offset & ~mask; 5030 5036 alloc_end = (offset + len + mask) & ~mask; 5037 + 5038 + /* 5039 + * wait for ordered IO before we have any locks. We'll loop again 5040 + * below with the locks held. 5041 + */ 5042 + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); 5031 5043 5032 5044 mutex_lock(&inode->i_mutex); 5033 5045 if (alloc_start > inode->i_size) { ··· 5045 5043 5046 5044 while (1) { 5047 5045 struct btrfs_ordered_extent *ordered; 5046 + 5047 + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); 5048 + if (!trans) { 5049 + ret = -EIO; 5050 + goto out; 5051 + } 5052 + 5053 + /* the extent lock is ordered inside the running 5054 + * transaction 5055 + */ 5048 5056 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, 5049 5057 alloc_end - 1, GFP_NOFS); 5050 5058 ordered = btrfs_lookup_first_ordered_extent(inode, ··· 5065 5053 btrfs_put_ordered_extent(ordered); 5066 5054 unlock_extent(&BTRFS_I(inode)->io_tree, 5067 5055 alloc_start, alloc_end - 1, GFP_NOFS); 5056 + btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5057 + 5058 + /* 5059 + * we can't wait on the range with the transaction 5060 + * running or with the extent lock held 5061 + */ 5068 5062 btrfs_wait_ordered_range(inode, alloc_start, 5069 5063 alloc_end - alloc_start); 5070 5064 } else { ··· 5088 5070 last_byte = min(extent_map_end(em), alloc_end); 5089 5071 last_byte = (last_byte + mask) & ~mask; 5090 5072 if (em->block_start == EXTENT_MAP_HOLE) { 5091 - ret = prealloc_file_range(inode, cur_offset, 5073 + ret = prealloc_file_range(trans, inode, cur_offset, 5092 5074 last_byte, alloc_hint, mode); 5093 5075 if (ret < 0) { 5094 5076 free_extent_map(em); ··· 5107 5089 } 5108 5090 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, 5109 5091 GFP_NOFS); 5092 + 5093 + btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5110 5094 out: 5111 5095 mutex_unlock(&inode->i_mutex); 5112 5096 return ret;
+1 -1
fs/btrfs/ordered-data.c
··· 489 489 /* start IO across the range first to instantiate any delalloc 490 490 * extents 491 491 */ 492 - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); 492 + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 493 493 494 494 /* The compression code will leave pages locked but return from 495 495 * writepage without setting the page writeback. Starting again
+97 -27
fs/btrfs/volumes.c
··· 125 125 return NULL; 126 126 } 127 127 128 + static void requeue_list(struct btrfs_pending_bios *pending_bios, 129 + struct bio *head, struct bio *tail) 130 + { 131 + 132 + struct bio *old_head; 133 + 134 + old_head = pending_bios->head; 135 + pending_bios->head = head; 136 + if (pending_bios->tail) 137 + tail->bi_next = old_head; 138 + else 139 + pending_bios->tail = tail; 140 + } 141 + 128 142 /* 129 143 * we try to collect pending bios for a device so we don't get a large 130 144 * number of procs sending bios down to the same device. This greatly ··· 155 141 struct bio *pending; 156 142 struct backing_dev_info *bdi; 157 143 struct btrfs_fs_info *fs_info; 144 + struct btrfs_pending_bios *pending_bios; 158 145 struct bio *tail; 159 146 struct bio *cur; 160 147 int again = 0; 161 - unsigned long num_run = 0; 148 + unsigned long num_run; 149 + unsigned long num_sync_run; 162 150 unsigned long limit; 163 151 unsigned long last_waited = 0; 164 152 ··· 169 153 limit = btrfs_async_submit_limit(fs_info); 170 154 limit = limit * 2 / 3; 171 155 156 + /* we want to make sure that every time we switch from the sync 157 + * list to the normal list, we unplug 158 + */ 159 + num_sync_run = 0; 160 + 172 161 loop: 173 162 spin_lock(&device->io_lock); 163 + num_run = 0; 174 164 175 165 loop_lock: 166 + 176 167 /* take all the bios off the list at once and process them 177 168 * later on (without the lock held). But, remember the 178 169 * tail and other pointers so the bios can be properly reinserted 179 170 * into the list if we hit congestion 180 171 */ 181 - pending = device->pending_bios; 182 - tail = device->pending_bio_tail; 172 + if (device->pending_sync_bios.head) 173 + pending_bios = &device->pending_sync_bios; 174 + else 175 + pending_bios = &device->pending_bios; 176 + 177 + pending = pending_bios->head; 178 + tail = pending_bios->tail; 183 179 WARN_ON(pending && !tail); 184 - device->pending_bios = NULL; 185 - device->pending_bio_tail = NULL; 186 180 187 181 /* 188 182 * if pending was null this time around, no bios need processing ··· 202 176 * device->running_pending is used to synchronize with the 203 177 * schedule_bio code. 204 178 */ 205 - if (pending) { 206 - again = 1; 207 - device->running_pending = 1; 208 - } else { 179 + if (device->pending_sync_bios.head == NULL && 180 + device->pending_bios.head == NULL) { 209 181 again = 0; 210 182 device->running_pending = 0; 183 + } else { 184 + again = 1; 185 + device->running_pending = 1; 211 186 } 187 + 188 + pending_bios->head = NULL; 189 + pending_bios->tail = NULL; 190 + 212 191 spin_unlock(&device->io_lock); 213 192 193 + /* 194 + * if we're doing the regular priority list, make sure we unplug 195 + * for any high prio bios we've sent down 196 + */ 197 + if (pending_bios == &device->pending_bios && num_sync_run > 0) { 198 + num_sync_run = 0; 199 + blk_run_backing_dev(bdi, NULL); 200 + } 201 + 214 202 while (pending) { 203 + 204 + rmb(); 205 + if (pending_bios != &device->pending_sync_bios && 206 + device->pending_sync_bios.head && 207 + num_run > 16) { 208 + cond_resched(); 209 + spin_lock(&device->io_lock); 210 + requeue_list(pending_bios, pending, tail); 211 + goto loop_lock; 212 + } 213 + 215 214 cur = pending; 216 215 pending = pending->bi_next; 217 216 cur->bi_next = NULL; ··· 247 196 wake_up(&fs_info->async_submit_wait); 248 197 249 198 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 250 - bio_get(cur); 251 199 submit_bio(cur->bi_rw, cur); 252 - bio_put(cur); 253 200 num_run++; 201 + if (bio_sync(cur)) 202 + num_sync_run++; 203 + 204 + if (need_resched()) { 205 + if (num_sync_run) { 206 + blk_run_backing_dev(bdi, NULL); 207 + num_sync_run = 0; 208 + } 209 + cond_resched(); 210 + } 254 211 255 212 /* 256 213 * we made progress, there is more work to do and the bdi ··· 267 208 */ 268 209 if (pending && bdi_write_congested(bdi) && num_run > 16 && 269 210 fs_info->fs_devices->open_devices > 1) { 270 - struct bio *old_head; 271 211 struct io_context *ioc; 272 212 273 213 ioc = current->io_context; ··· 291 233 * against it before looping 292 234 */ 293 235 last_waited = ioc->last_waited; 236 + if (need_resched()) { 237 + if (num_sync_run) { 238 + blk_run_backing_dev(bdi, NULL); 239 + num_sync_run = 0; 240 + } 241 + cond_resched(); 242 + } 294 243 continue; 295 244 } 296 245 spin_lock(&device->io_lock); 297 - 298 - old_head = device->pending_bios; 299 - device->pending_bios = pending; 300 - if (device->pending_bio_tail) 301 - tail->bi_next = old_head; 302 - else 303 - device->pending_bio_tail = tail; 304 - 246 + requeue_list(pending_bios, pending, tail); 305 247 device->running_pending = 1; 306 248 307 249 spin_unlock(&device->io_lock); ··· 309 251 goto done; 310 252 } 311 253 } 254 + 255 + if (num_sync_run) { 256 + num_sync_run = 0; 257 + blk_run_backing_dev(bdi, NULL); 258 + } 259 + 260 + cond_resched(); 312 261 if (again) 313 262 goto loop; 314 263 315 264 spin_lock(&device->io_lock); 316 - if (device->pending_bios) 265 + if (device->pending_bios.head || device->pending_sync_bios.head) 317 266 goto loop_lock; 318 267 spin_unlock(&device->io_lock); 319 268 ··· 2562 2497 max_errors = 1; 2563 2498 } 2564 2499 } 2565 - if (multi_ret && rw == WRITE && 2500 + if (multi_ret && (rw & (1 << BIO_RW)) && 2566 2501 stripes_allocated < stripes_required) { 2567 2502 stripes_allocated = map->num_stripes; 2568 2503 free_extent_map(em); ··· 2827 2762 int rw, struct bio *bio) 2828 2763 { 2829 2764 int should_queue = 1; 2765 + struct btrfs_pending_bios *pending_bios; 2830 2766 2831 2767 /* don't bother with additional async steps for reads, right now */ 2832 2768 if (!(rw & (1 << BIO_RW))) { ··· 2849 2783 bio->bi_rw |= rw; 2850 2784 2851 2785 spin_lock(&device->io_lock); 2786 + if (bio_sync(bio)) 2787 + pending_bios = &device->pending_sync_bios; 2788 + else 2789 + pending_bios = &device->pending_bios; 2852 2790 2853 - if (device->pending_bio_tail) 2854 - device->pending_bio_tail->bi_next = bio; 2791 + if (pending_bios->tail) 2792 + pending_bios->tail->bi_next = bio; 2855 2793 2856 - device->pending_bio_tail = bio; 2857 - if (!device->pending_bios) 2858 - device->pending_bios = bio; 2794 + pending_bios->tail = bio; 2795 + if (!pending_bios->head) 2796 + pending_bios->head = bio; 2859 2797 if (device->running_pending) 2860 2798 should_queue = 0; 2861 2799
+11 -2
fs/btrfs/volumes.h
··· 23 23 #include "async-thread.h" 24 24 25 25 struct buffer_head; 26 + struct btrfs_pending_bios { 27 + struct bio *head; 28 + struct bio *tail; 29 + }; 30 + 26 31 struct btrfs_device { 27 32 struct list_head dev_list; 28 33 struct list_head dev_alloc_list; 29 34 struct btrfs_fs_devices *fs_devices; 30 35 struct btrfs_root *dev_root; 31 - struct bio *pending_bios; 32 - struct bio *pending_bio_tail; 36 + 37 + /* regular prio bios */ 38 + struct btrfs_pending_bios pending_bios; 39 + /* WRITE_SYNC bios */ 40 + struct btrfs_pending_bios pending_sync_bios; 41 + 33 42 int running_pending; 34 43 u64 generation; 35 44