Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
ceph: use separate class for ceph sockets' sk_lock
ceph: reserve one more caps space when doing readdir
ceph: queue_cap_snap should always queue dirty context
ceph: fix dentry reference leak in dcache readdir
ceph: decode v5 of osdmap (pool names) [protocol change]
ceph: fix ack counter reset on connection reset
ceph: fix leaked inode ref due to snap metadata writeback race
ceph: fix snap context reference leaks
ceph: allow writeback of snapped pages older than 'oldest' snapc
ceph: fix dentry rehashing on virtual .snap dir

Linus Torvalds 16 years ago 96e35b40 f5c07a2d

+212 -134

10 changed files

expand all collapse all

ceph

addr.c

caps.c

dir.c

inode.c

messenger.c

osdmap.c

osdmap.h

rados.h

snap.c

super.h

+30 -32

fs/ceph/addr.c

reviewed

··· 337 337 /* 338 338 * Get ref for the oldest snapc for an inode with dirty data... that is, the 339 339 * only snap context we are allowed to write back. 340 340 - * 341 341 - * Caller holds i_lock. 342 340 */ 343 343 - static struct ceph_snap_context *__get_oldest_context(struct inode *inode, 344 344 - u64 *snap_size) 341 341 + static struct ceph_snap_context *get_oldest_context(struct inode *inode, 342 342 + u64 *snap_size) 345 343 { 346 344 struct ceph_inode_info *ci = ceph_inode(inode); 347 345 struct ceph_snap_context *snapc = NULL; 348 346 struct ceph_cap_snap *capsnap = NULL; 349 347 348 348 + spin_lock(&inode->i_lock); 350 349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 351 350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 352 351 capsnap->context, capsnap->dirty_pages); ··· 356 357 break; 357 358 } 358 359 } 359 359 - if (!snapc && ci->i_snap_realm) { 360 360 - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); 360 360 + if (!snapc && ci->i_head_snapc) { 361 361 + snapc = ceph_get_snap_context(ci->i_head_snapc); 361 362 dout(" head snapc %p has %d dirty pages\n", 362 363 snapc, ci->i_wrbuffer_ref_head); 363 364 } 364 364 - return snapc; 365 365 - } 366 366 - 367 367 - static struct ceph_snap_context *get_oldest_context(struct inode *inode, 368 368 - u64 *snap_size) 369 369 - { 370 370 - struct ceph_snap_context *snapc = NULL; 371 371 - 372 372 - spin_lock(&inode->i_lock); 373 373 - snapc = __get_oldest_context(inode, snap_size); 374 365 spin_unlock(&inode->i_lock); 375 366 return snapc; 376 367 } ··· 381 392 int len = PAGE_CACHE_SIZE; 382 393 loff_t i_size; 383 394 int err = 0; 384 384 - struct ceph_snap_context *snapc; 395 395 + struct ceph_snap_context *snapc, *oldest; 385 396 u64 snap_size = 0; 386 397 long writeback_stat; 387 398 ··· 402 413 dout("writepage %p page %p not dirty?\n", inode, page); 403 414 goto out; 404 415 } 405 405 - if (snapc != get_oldest_context(inode, &snap_size)) { 416 416 + oldest = get_oldest_context(inode, &snap_size); 417 417 + if (snapc->seq > oldest->seq) { 406 418 dout("writepage %p page %p snapc %p not writeable - noop\n", 407 419 inode, page, (void *)page->private); 408 420 /* we should only noop if called by kswapd */ 409 421 WARN_ON((current->flags & PF_MEMALLOC) == 0); 422 422 + ceph_put_snap_context(oldest); 410 423 goto out; 411 424 } 425 425 + ceph_put_snap_context(oldest); 412 426 413 427 /* is this a partial page at end of file? */ 414 428 if (snap_size) ··· 450 458 ClearPagePrivate(page); 451 459 end_page_writeback(page); 452 460 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 453 453 - ceph_put_snap_context(snapc); 461 461 + ceph_put_snap_context(snapc); /* page's reference */ 454 462 out: 455 463 return err; 456 464 } ··· 550 558 dout("inode %p skipping page %p\n", inode, page); 551 559 wbc->pages_skipped++; 552 560 } 561 561 + ceph_put_snap_context((void *)page->private); 553 562 page->private = 0; 554 563 ClearPagePrivate(page); 555 555 - ceph_put_snap_context(snapc); 556 564 dout("unlocking %d %p\n", i, page); 557 565 end_page_writeback(page); 558 566 ··· 610 618 int range_whole = 0; 611 619 int should_loop = 1; 612 620 pgoff_t max_pages = 0, max_pages_ever = 0; 613 613 - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; 621 621 + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; 614 622 struct pagevec pvec; 615 623 int done = 0; 616 624 int rc = 0; ··· 762 770 } 763 771 764 772 /* only if matching snap context */ 765 765 - if (snapc != (void *)page->private) { 766 766 - dout("page snapc %p != oldest %p\n", 767 767 - (void *)page->private, snapc); 773 773 + pgsnapc = (void *)page->private; 774 774 + if (pgsnapc->seq > snapc->seq) { 775 775 + dout("page snapc %p %lld > oldest %p %lld\n", 776 776 + pgsnapc, pgsnapc->seq, snapc, snapc->seq); 768 777 unlock_page(page); 769 778 if (!locked_pages) 770 779 continue; /* keep looking for snap */ ··· 907 914 struct ceph_snap_context *snapc) 908 915 { 909 916 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); 910 910 - return !oldest || snapc->seq <= oldest->seq; 917 917 + int ret = !oldest || snapc->seq <= oldest->seq; 918 918 + 919 919 + ceph_put_snap_context(oldest); 920 920 + return ret; 911 921 } 912 922 913 923 /* ··· 932 936 int pos_in_page = pos & ~PAGE_CACHE_MASK; 933 937 int end_in_page = pos_in_page + len; 934 938 loff_t i_size; 935 935 - struct ceph_snap_context *snapc; 936 939 int r; 940 940 + struct ceph_snap_context *snapc, *oldest; 937 941 938 942 retry_locked: 939 943 /* writepages currently holds page lock, but if we change that later, */ ··· 943 947 BUG_ON(!ci->i_snap_realm); 944 948 down_read(&mdsc->snap_rwsem); 945 949 BUG_ON(!ci->i_snap_realm->cached_context); 946 946 - if (page->private && 947 947 - (void *)page->private != ci->i_snap_realm->cached_context) { 950 950 + snapc = (void *)page->private; 951 951 + if (snapc && snapc != ci->i_head_snapc) { 948 952 /* 949 953 * this page is already dirty in another (older) snap 950 954 * context! is it writeable now? 951 955 */ 952 952 - snapc = get_oldest_context(inode, NULL); 956 956 + oldest = get_oldest_context(inode, NULL); 953 957 up_read(&mdsc->snap_rwsem); 954 958 955 955 - if (snapc != (void *)page->private) { 959 959 + if (snapc->seq > oldest->seq) { 960 960 + ceph_put_snap_context(oldest); 956 961 dout(" page %p snapc %p not current or oldest\n", 957 957 - page, (void *)page->private); 962 962 + page, snapc); 958 963 /* 959 964 * queue for writeback, and wait for snapc to 960 965 * be writeable or written 961 966 */ 962 962 - snapc = ceph_get_snap_context((void *)page->private); 967 967 + snapc = ceph_get_snap_context(snapc); 963 968 unlock_page(page); 964 969 ceph_queue_writeback(inode); 965 970 r = wait_event_interruptible(ci->i_cap_wq, ··· 970 973 return r; 971 974 return -EAGAIN; 972 975 } 976 976 + ceph_put_snap_context(oldest); 973 977 974 978 /* yay, writeable, do it now (without dropping page lock) */ 975 979 dout(" page %p snapc %p not current, but oldest\n",

+32 -10

fs/ceph/caps.c

reviewed

··· 1205 1205 if (capsnap->dirty_pages || capsnap->writing) 1206 1206 continue; 1207 1207 1208 1208 + /* 1209 1209 + * if cap writeback already occurred, we should have dropped 1210 1210 + * the capsnap in ceph_put_wrbuffer_cap_refs. 1211 1211 + */ 1212 1212 + BUG_ON(capsnap->dirty == 0); 1213 1213 + 1208 1214 /* pick mds, take s_mutex */ 1209 1215 mds = __ceph_get_cap_mds(ci, &mseq); 1210 1216 if (session && session->s_mds != mds) { ··· 2124 2118 } 2125 2119 spin_unlock(&inode->i_lock); 2126 2120 2127 2127 - dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had), 2128 2128 - last ? "last" : ""); 2121 2121 + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), 2122 2122 + last ? " last" : "", put ? " put" : ""); 2129 2123 2130 2124 if (last && !flushsnaps) 2131 2125 ceph_check_caps(ci, 0, NULL); ··· 2149 2143 { 2150 2144 struct inode *inode = &ci->vfs_inode; 2151 2145 int last = 0; 2152 2152 - int last_snap = 0; 2146 2146 + int complete_capsnap = 0; 2147 2147 + int drop_capsnap = 0; 2153 2148 int found = 0; 2154 2149 struct ceph_cap_snap *capsnap = NULL; 2155 2150 ··· 2173 2166 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2174 2167 if (capsnap->context == snapc) { 2175 2168 found = 1; 2176 2176 - capsnap->dirty_pages -= nr; 2177 2177 - last_snap = !capsnap->dirty_pages; 2178 2169 break; 2179 2170 } 2180 2171 } 2181 2172 BUG_ON(!found); 2173 2173 + capsnap->dirty_pages -= nr; 2174 2174 + if (capsnap->dirty_pages == 0) { 2175 2175 + complete_capsnap = 1; 2176 2176 + if (capsnap->dirty == 0) 2177 2177 + /* cap writeback completed before we created 2178 2178 + * the cap_snap; no FLUSHSNAP is needed */ 2179 2179 + drop_capsnap = 1; 2180 2180 + } 2182 2181 dout("put_wrbuffer_cap_refs on %p cap_snap %p " 2183 2183 - " snap %lld %d/%d -> %d/%d %s%s\n", 2182 2182 + " snap %lld %d/%d -> %d/%d %s%s%s\n", 2184 2183 inode, capsnap, capsnap->context->seq, 2185 2184 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, 2186 2185 ci->i_wrbuffer_ref, capsnap->dirty_pages, 2187 2186 last ? " (wrbuffer last)" : "", 2188 2188 - last_snap ? " (capsnap last)" : ""); 2187 2187 + complete_capsnap ? " (complete capsnap)" : "", 2188 2188 + drop_capsnap ? " (drop capsnap)" : ""); 2189 2189 + if (drop_capsnap) { 2190 2190 + ceph_put_snap_context(capsnap->context); 2191 2191 + list_del(&capsnap->ci_item); 2192 2192 + list_del(&capsnap->flushing_item); 2193 2193 + ceph_put_cap_snap(capsnap); 2194 2194 + } 2189 2195 } 2190 2196 2191 2197 spin_unlock(&inode->i_lock); ··· 2206 2186 if (last) { 2207 2187 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2208 2188 iput(inode); 2209 2209 - } else if (last_snap) { 2189 2189 + } else if (complete_capsnap) { 2210 2190 ceph_flush_snaps(ci); 2211 2191 wake_up(&ci->i_cap_wq); 2212 2192 } 2193 2193 + if (drop_capsnap) 2194 2194 + iput(inode); 2213 2195 } 2214 2196 2215 2197 /* ··· 2487 2465 break; 2488 2466 } 2489 2467 WARN_ON(capsnap->dirty_pages || capsnap->writing); 2490 2490 - dout(" removing cap_snap %p follows %lld\n", 2491 2491 - capsnap, follows); 2468 2468 + dout(" removing %p cap_snap %p follows %lld\n", 2469 2469 + inode, capsnap, follows); 2492 2470 ceph_put_snap_context(capsnap->context); 2493 2471 list_del(&capsnap->ci_item); 2494 2472 list_del(&capsnap->flushing_item);

+4 -3

fs/ceph/dir.c

reviewed

··· 171 171 spin_lock(&inode->i_lock); 172 172 spin_lock(&dcache_lock); 173 173 174 174 + last = dentry; 175 175 + 174 176 if (err < 0) 175 177 goto out_unlock; 176 176 - 177 177 - last = dentry; 178 178 179 179 p = p->prev; 180 180 filp->f_pos++; ··· 312 312 req->r_readdir_offset = fi->next_offset; 313 313 req->r_args.readdir.frag = cpu_to_le32(frag); 314 314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 315 315 - req->r_num_caps = max_entries; 315 315 + req->r_num_caps = max_entries + 1; 316 316 err = ceph_mdsc_do_request(mdsc, NULL, req); 317 317 if (err < 0) { 318 318 ceph_mdsc_put_request(req); ··· 489 489 struct inode *inode = ceph_get_snapdir(parent); 490 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 491 491 dentry, dentry->d_name.len, dentry->d_name.name, inode); 492 492 + BUG_ON(!d_unhashed(dentry)); 492 493 d_add(dentry, inode); 493 494 err = 0; 494 495 }

+9 -1

fs/ceph/inode.c

reviewed

··· 886 886 struct inode *in = NULL; 887 887 struct ceph_mds_reply_inode *ininfo; 888 888 struct ceph_vino vino; 889 889 + struct ceph_client *client = ceph_sb_to_client(sb); 889 890 int i = 0; 890 891 int err = 0; 891 892 ··· 950 949 return err; 951 950 } 952 951 953 953 - if (rinfo->head->is_dentry && !req->r_aborted) { 952 952 + /* 953 953 + * ignore null lease/binding on snapdir ENOENT, or else we 954 954 + * will have trouble splicing in the virtual snapdir later 955 955 + */ 956 956 + if (rinfo->head->is_dentry && !req->r_aborted && 957 957 + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 958 958 + client->mount_args->snapdir_name, 959 959 + req->r_dentry->d_name.len))) { 954 960 /* 955 961 * lookup link rename : null -> possibly existing inode 956 962 * mknod symlink mkdir : null -> new inode

fs/ceph/messenger.c

reviewed

··· 30 30 static char tag_ack = CEPH_MSGR_TAG_ACK; 31 31 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 32 32 33 33 + #ifdef CONFIG_LOCKDEP 34 34 + static struct lock_class_key socket_class; 35 35 + #endif 36 36 + 33 37 34 38 static void queue_con(struct ceph_connection *con); 35 39 static void con_work(struct work_struct *); ··· 232 228 con->sock = sock; 233 229 sock->sk->sk_allocation = GFP_NOFS; 234 230 231 231 + #ifdef CONFIG_LOCKDEP 232 232 + lockdep_set_class(&sock->sk->sk_lock, &socket_class); 233 233 + #endif 234 234 + 235 235 set_sock_callbacks(sock, con); 236 236 237 237 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); ··· 341 333 con->out_msg = NULL; 342 334 } 343 335 con->in_seq = 0; 336 336 + con->in_seq_acked = 0; 344 337 } 345 338 346 339 /*

+109 -71

fs/ceph/osdmap.c

reviewed

··· 314 314 return ERR_PTR(err); 315 315 } 316 316 317 317 - 318 318 - /* 319 319 - * osd map 320 320 - */ 321 321 - void ceph_osdmap_destroy(struct ceph_osdmap *map) 322 322 - { 323 323 - dout("osdmap_destroy %p\n", map); 324 324 - if (map->crush) 325 325 - crush_destroy(map->crush); 326 326 - while (!RB_EMPTY_ROOT(&map->pg_temp)) { 327 327 - struct ceph_pg_mapping *pg = 328 328 - rb_entry(rb_first(&map->pg_temp), 329 329 - struct ceph_pg_mapping, node); 330 330 - rb_erase(&pg->node, &map->pg_temp); 331 331 - kfree(pg); 332 332 - } 333 333 - while (!RB_EMPTY_ROOT(&map->pg_pools)) { 334 334 - struct ceph_pg_pool_info *pi = 335 335 - rb_entry(rb_first(&map->pg_pools), 336 336 - struct ceph_pg_pool_info, node); 337 337 - rb_erase(&pi->node, &map->pg_pools); 338 338 - kfree(pi); 339 339 - } 340 340 - kfree(map->osd_state); 341 341 - kfree(map->osd_weight); 342 342 - kfree(map->osd_addr); 343 343 - kfree(map); 344 344 - } 345 345 - 346 346 - /* 347 347 - * adjust max osd value. reallocate arrays. 348 348 - */ 349 349 - static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 350 350 - { 351 351 - u8 *state; 352 352 - struct ceph_entity_addr *addr; 353 353 - u32 *weight; 354 354 - 355 355 - state = kcalloc(max, sizeof(*state), GFP_NOFS); 356 356 - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); 357 357 - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); 358 358 - if (state == NULL || addr == NULL || weight == NULL) { 359 359 - kfree(state); 360 360 - kfree(addr); 361 361 - kfree(weight); 362 362 - return -ENOMEM; 363 363 - } 364 364 - 365 365 - /* copy old? */ 366 366 - if (map->osd_state) { 367 367 - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); 368 368 - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); 369 369 - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); 370 370 - kfree(map->osd_state); 371 371 - kfree(map->osd_addr); 372 372 - kfree(map->osd_weight); 373 373 - } 374 374 - 375 375 - map->osd_state = state; 376 376 - map->osd_weight = weight; 377 377 - map->osd_addr = addr; 378 378 - map->max_osd = max; 379 379 - return 0; 380 380 - } 381 381 - 382 317 /* 383 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 384 319 * to a set of osds) ··· 417 482 return NULL; 418 483 } 419 484 485 485 + static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) 486 486 + { 487 487 + rb_erase(&pi->node, root); 488 488 + kfree(pi->name); 489 489 + kfree(pi); 490 490 + } 491 491 + 420 492 void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 421 493 { 422 494 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 423 495 calc_pg_masks(pi); 424 496 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); 425 497 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 498 498 + } 499 499 + 500 500 + static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 501 501 + { 502 502 + struct ceph_pg_pool_info *pi; 503 503 + u32 num, len, pool; 504 504 + 505 505 + ceph_decode_32_safe(p, end, num, bad); 506 506 + dout(" %d pool names\n", num); 507 507 + while (num--) { 508 508 + ceph_decode_32_safe(p, end, pool, bad); 509 509 + ceph_decode_32_safe(p, end, len, bad); 510 510 + dout(" pool %d len %d\n", pool, len); 511 511 + pi = __lookup_pg_pool(&map->pg_pools, pool); 512 512 + if (pi) { 513 513 + kfree(pi->name); 514 514 + pi->name = kmalloc(len + 1, GFP_NOFS); 515 515 + if (pi->name) { 516 516 + memcpy(pi->name, *p, len); 517 517 + pi->name[len] = '\0'; 518 518 + dout(" name is %s\n", pi->name); 519 519 + } 520 520 + } 521 521 + *p += len; 522 522 + } 523 523 + return 0; 524 524 + 525 525 + bad: 526 526 + return -EINVAL; 527 527 + } 528 528 + 529 529 + /* 530 530 + * osd map 531 531 + */ 532 532 + void ceph_osdmap_destroy(struct ceph_osdmap *map) 533 533 + { 534 534 + dout("osdmap_destroy %p\n", map); 535 535 + if (map->crush) 536 536 + crush_destroy(map->crush); 537 537 + while (!RB_EMPTY_ROOT(&map->pg_temp)) { 538 538 + struct ceph_pg_mapping *pg = 539 539 + rb_entry(rb_first(&map->pg_temp), 540 540 + struct ceph_pg_mapping, node); 541 541 + rb_erase(&pg->node, &map->pg_temp); 542 542 + kfree(pg); 543 543 + } 544 544 + while (!RB_EMPTY_ROOT(&map->pg_pools)) { 545 545 + struct ceph_pg_pool_info *pi = 546 546 + rb_entry(rb_first(&map->pg_pools), 547 547 + struct ceph_pg_pool_info, node); 548 548 + __remove_pg_pool(&map->pg_pools, pi); 549 549 + } 550 550 + kfree(map->osd_state); 551 551 + kfree(map->osd_weight); 552 552 + kfree(map->osd_addr); 553 553 + kfree(map); 554 554 + } 555 555 + 556 556 + /* 557 557 + * adjust max osd value. reallocate arrays. 558 558 + */ 559 559 + static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 560 560 + { 561 561 + u8 *state; 562 562 + struct ceph_entity_addr *addr; 563 563 + u32 *weight; 564 564 + 565 565 + state = kcalloc(max, sizeof(*state), GFP_NOFS); 566 566 + addr = kcalloc(max, sizeof(*addr), GFP_NOFS); 567 567 + weight = kcalloc(max, sizeof(*weight), GFP_NOFS); 568 568 + if (state == NULL || addr == NULL || weight == NULL) { 569 569 + kfree(state); 570 570 + kfree(addr); 571 571 + kfree(weight); 572 572 + return -ENOMEM; 573 573 + } 574 574 + 575 575 + /* copy old? */ 576 576 + if (map->osd_state) { 577 577 + memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); 578 578 + memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); 579 579 + memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); 580 580 + kfree(map->osd_state); 581 581 + kfree(map->osd_addr); 582 582 + kfree(map->osd_weight); 583 583 + } 584 584 + 585 585 + map->osd_state = state; 586 586 + map->osd_weight = weight; 587 587 + map->osd_addr = addr; 588 588 + map->max_osd = max; 589 589 + return 0; 426 590 } 427 591 428 592 /* ··· 560 526 ceph_decode_32_safe(p, end, max, bad); 561 527 while (max--) { 562 528 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 563 563 - pi = kmalloc(sizeof(*pi), GFP_NOFS); 529 529 + pi = kzalloc(sizeof(*pi), GFP_NOFS); 564 530 if (!pi) 565 531 goto bad; 566 532 pi->id = ceph_decode_32(p); ··· 573 539 __decode_pool(p, pi); 574 540 __insert_pg_pool(&map->pg_pools, pi); 575 541 } 542 542 + 543 543 + if (version >= 5 && __decode_pool_names(p, end, map) < 0) 544 544 + goto bad; 545 545 + 576 546 ceph_decode_32_safe(p, end, map->pool_max, bad); 577 547 578 548 ceph_decode_32_safe(p, end, map->flags, bad); ··· 750 712 } 751 713 pi = __lookup_pg_pool(&map->pg_pools, pool); 752 714 if (!pi) { 753 753 - pi = kmalloc(sizeof(*pi), GFP_NOFS); 715 715 + pi = kzalloc(sizeof(*pi), GFP_NOFS); 754 716 if (!pi) { 755 717 err = -ENOMEM; 756 718 goto bad; ··· 760 722 } 761 723 __decode_pool(p, pi); 762 724 } 725 725 + if (version >= 5 && __decode_pool_names(p, end, map) < 0) 726 726 + goto bad; 763 727 764 728 /* old_pool */ 765 729 ceph_decode_32_safe(p, end, len, bad); ··· 770 730 771 731 ceph_decode_32_safe(p, end, pool, bad); 772 732 pi = __lookup_pg_pool(&map->pg_pools, pool); 773 773 - if (pi) { 774 774 - rb_erase(&pi->node, &map->pg_pools); 775 775 - kfree(pi); 776 776 - } 733 733 + if (pi) 734 734 + __remove_pg_pool(&map->pg_pools, pi); 777 735 } 778 736 779 737 /* new_up */

fs/ceph/osdmap.h

reviewed

··· 23 23 int id; 24 24 struct ceph_pg_pool v; 25 25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 26 26 + char *name; 26 27 }; 27 28 28 29 struct ceph_pg_mapping {

+4 -2

fs/ceph/rados.h

reviewed

··· 11 11 /* 12 12 * osdmap encoding versions 13 13 */ 14 14 - #define CEPH_OSDMAP_INC_VERSION 4 15 15 - #define CEPH_OSDMAP_VERSION 4 14 14 + #define CEPH_OSDMAP_INC_VERSION 5 15 15 + #define CEPH_OSDMAP_INC_VERSION_EXT 5 16 16 + #define CEPH_OSDMAP_VERSION 5 17 17 + #define CEPH_OSDMAP_VERSION_EXT 5 16 18 17 19 /* 18 20 * fs id

+13 -13

fs/ceph/snap.c

reviewed

··· 431 431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't 432 432 * change). 433 433 */ 434 434 - void ceph_queue_cap_snap(struct ceph_inode_info *ci, 435 435 - struct ceph_snap_context *snapc) 434 434 + void ceph_queue_cap_snap(struct ceph_inode_info *ci) 436 435 { 437 436 struct inode *inode = &ci->vfs_inode; 438 437 struct ceph_cap_snap *capsnap; ··· 450 451 as no new writes are allowed to start when pending, so any 451 452 writes in progress now were started before the previous 452 453 cap_snap. lucky us. */ 453 453 - dout("queue_cap_snap %p snapc %p seq %llu used %d" 454 454 - " already pending\n", inode, snapc, snapc->seq, used); 454 454 + dout("queue_cap_snap %p already pending\n", inode); 455 455 kfree(capsnap); 456 456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 457 457 + struct ceph_snap_context *snapc = ci->i_head_snapc; 458 458 + 457 459 igrab(inode); 458 460 459 461 atomic_set(&capsnap->nref, 1); ··· 463 463 INIT_LIST_HEAD(&capsnap->flushing_item); 464 464 465 465 capsnap->follows = snapc->seq - 1; 466 466 - capsnap->context = ceph_get_snap_context(snapc); 467 466 capsnap->issued = __ceph_caps_issued(ci, NULL); 468 467 capsnap->dirty = __ceph_caps_dirty(ci); 469 468 ··· 479 480 snapshot. */ 480 481 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 481 482 ci->i_wrbuffer_ref_head = 0; 482 482 - ceph_put_snap_context(ci->i_head_snapc); 483 483 + capsnap->context = snapc; 483 484 ci->i_head_snapc = NULL; 484 485 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 485 486 ··· 521 522 capsnap->ctime = inode->i_ctime; 522 523 capsnap->time_warp_seq = ci->i_time_warp_seq; 523 524 if (capsnap->dirty_pages) { 524 524 - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " 525 525 + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " 525 526 "still has %d dirty pages\n", inode, capsnap, 526 527 capsnap->context, capsnap->context->seq, 527 527 - capsnap->size, capsnap->dirty_pages); 528 528 + ceph_cap_string(capsnap->dirty), capsnap->size, 529 529 + capsnap->dirty_pages); 528 530 return 0; 529 531 } 530 530 - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", 532 532 + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", 531 533 inode, capsnap, capsnap->context, 532 532 - capsnap->context->seq, capsnap->size); 534 534 + capsnap->context->seq, ceph_cap_string(capsnap->dirty), 535 535 + capsnap->size); 533 536 534 537 spin_lock(&mdsc->snap_flush_lock); 535 538 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); ··· 603 602 if (lastinode) 604 603 iput(lastinode); 605 604 lastinode = inode; 606 606 - ceph_queue_cap_snap(ci, realm->cached_context); 605 605 + ceph_queue_cap_snap(ci); 607 606 spin_lock(&realm->inodes_with_caps_lock); 608 607 } 609 608 spin_unlock(&realm->inodes_with_caps_lock); ··· 825 824 spin_unlock(&realm->inodes_with_caps_lock); 826 825 spin_unlock(&inode->i_lock); 827 826 828 828 - ceph_queue_cap_snap(ci, 829 829 - ci->i_snap_realm->cached_context); 827 827 + ceph_queue_cap_snap(ci); 830 828 831 829 iput(inode); 832 830 continue;

+1 -2

fs/ceph/super.h

reviewed

··· 715 715 extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 716 716 struct ceph_mds_session *session, 717 717 struct ceph_msg *msg); 718 718 - extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, 719 719 - struct ceph_snap_context *snapc); 718 718 + extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); 720 719 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 721 720 struct ceph_cap_snap *capsnap); 722 721 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);