Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph fixes from Sage Weil:
"We have a few wire protocol compatibility fixes, ports of a few recent
CRUSH mapping changes, and a couple error path fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
libceph: MOSDOpReply v7 encoding
libceph: advertise support for TUNABLES5
crush: decode and initialize chooseleaf_stable
crush: add chooseleaf_stable tunable
crush: ensure take bucket value is valid
crush: ensure bucket id is valid before indexing buckets array
ceph: fix snap context leak in error path
ceph: checking for IS_ERR instead of NULL

+75 -17
+3 -3
fs/ceph/file.c
··· 698 698 699 699 req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, 700 700 false, GFP_NOFS); 701 - if (IS_ERR(req)) { 702 - ret = PTR_ERR(req); 701 + if (!req) { 702 + ret = -ENOMEM; 703 703 req = orig_req; 704 704 goto out; 705 705 } ··· 716 716 ceph_osdc_build_request(req, req->r_ops[0].extent.offset, 717 717 snapc, CEPH_NOSNAP, &aio_req->mtime); 718 718 719 - ceph_put_snap_context(snapc); 720 719 ceph_osdc_put_request(orig_req); 721 720 722 721 req->r_callback = ceph_aio_complete_req; ··· 730 731 ceph_aio_complete_req(req, NULL); 731 732 } 732 733 734 + ceph_put_snap_context(snapc); 733 735 kfree(aio_work); 734 736 } 735 737
+15 -1
include/linux/ceph/ceph_features.h
··· 63 63 #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) 64 64 // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY 65 65 #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ 66 + #define CEPH_FEATURE_MON_METADATA (1ULL<<50) 67 + #define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ 68 + #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) 69 + #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) 70 + #define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) 71 + #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) 72 + #define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */ 73 + #define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ 74 + #define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ 75 + #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ 76 + // duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 77 + #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ 66 78 67 79 /* 68 80 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature ··· 120 108 CEPH_FEATURE_CRUSH_TUNABLES3 | \ 121 109 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ 122 110 CEPH_FEATURE_MSGR_KEEPALIVE2 | \ 123 - CEPH_FEATURE_CRUSH_V4) 111 + CEPH_FEATURE_CRUSH_V4 | \ 112 + CEPH_FEATURE_CRUSH_TUNABLES5 | \ 113 + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) 124 114 125 115 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 126 116 (CEPH_FEATURE_NOSRCADDR | \
+7 -1
include/linux/crush/crush.h
··· 59 59 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 60 60 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 61 61 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 62 - CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 62 + CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, 63 + CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 63 64 }; 64 65 65 66 /* ··· 205 204 * that want to limit reshuffling, a value of 3 or 4 will make the 206 205 * mappings line up a bit better with previous mappings. */ 207 206 __u8 chooseleaf_vary_r; 207 + 208 + /* if true, it makes chooseleaf firstn to return stable results (if 209 + * no local retry) so that data migrations would be optimal when some 210 + * device fails. */ 211 + __u8 chooseleaf_stable; 208 212 209 213 #ifndef __KERNEL__ 210 214 /*
+26 -7
net/ceph/crush/mapper.c
··· 403 403 * @local_retries: localized retries 404 404 * @local_fallback_retries: localized fallback retries 405 405 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 406 + * @stable: stable mode starts rep=0 in the recursive call for all replicas 406 407 * @vary_r: pass r to recursive calls 407 408 * @out2: second output vector for leaf items (if @recurse_to_leaf) 408 409 * @parent_r: r value passed from the parent ··· 420 419 unsigned int local_fallback_retries, 421 420 int recurse_to_leaf, 422 421 unsigned int vary_r, 422 + unsigned int stable, 423 423 int *out2, 424 424 int parent_r) 425 425 { ··· 435 433 int collide, reject; 436 434 int count = out_size; 437 435 438 - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", 436 + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", 439 437 recurse_to_leaf ? "_LEAF" : "", 440 438 bucket->id, x, outpos, numrep, 441 439 tries, recurse_tries, local_retries, local_fallback_retries, 442 - parent_r); 440 + parent_r, stable); 443 441 444 - for (rep = outpos; rep < numrep && count > 0 ; rep++) { 442 + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { 445 443 /* keep trying until we get a non-out, non-colliding item */ 446 444 ftotal = 0; 447 445 skip_rep = 0; ··· 514 512 if (crush_choose_firstn(map, 515 513 map->buckets[-1-item], 516 514 weight, weight_max, 517 - x, outpos+1, 0, 515 + x, stable ? 1 : outpos+1, 0, 518 516 out2, outpos, count, 519 517 recurse_tries, 0, 520 518 local_retries, 521 519 local_fallback_retries, 522 520 0, 523 521 vary_r, 522 + stable, 524 523 NULL, 525 524 sub_r) <= outpos) 526 525 /* didn't get leaf */ ··· 819 816 int choose_local_fallback_retries = map->choose_local_fallback_tries; 820 817 821 818 int vary_r = map->chooseleaf_vary_r; 819 + int stable = map->chooseleaf_stable; 822 820 823 821 if ((__u32)ruleno >= map->max_rules) { 824 822 dprintk(" bad ruleno %d\n", ruleno); ··· 839 835 case CRUSH_RULE_TAKE: 840 836 if ((curstep->arg1 >= 0 && 841 837 curstep->arg1 < map->max_devices) || 842 - (-1-curstep->arg1 < map->max_buckets && 838 + (-1-curstep->arg1 >= 0 && 839 + -1-curstep->arg1 < map->max_buckets && 843 840 map->buckets[-1-curstep->arg1])) { 844 841 w[0] = curstep->arg1; 845 842 wsize = 1; ··· 874 869 vary_r = curstep->arg1; 875 870 break; 876 871 872 + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: 873 + if (curstep->arg1 >= 0) 874 + stable = curstep->arg1; 875 + break; 876 + 877 877 case CRUSH_RULE_CHOOSELEAF_FIRSTN: 878 878 case CRUSH_RULE_CHOOSE_FIRSTN: 879 879 firstn = 1; ··· 898 888 osize = 0; 899 889 900 890 for (i = 0; i < wsize; i++) { 891 + int bno; 901 892 /* 902 893 * see CRUSH_N, CRUSH_N_MINUS macros. 903 894 * basically, numrep <= 0 means relative to ··· 911 900 continue; 912 901 } 913 902 j = 0; 903 + /* make sure bucket id is valid */ 904 + bno = -1 - w[i]; 905 + if (bno < 0 || bno >= map->max_buckets) { 906 + /* w[i] is probably CRUSH_ITEM_NONE */ 907 + dprintk(" bad w[i] %d\n", w[i]); 908 + continue; 909 + } 914 910 if (firstn) { 915 911 int recurse_tries; 916 912 if (choose_leaf_tries) ··· 929 911 recurse_tries = choose_tries; 930 912 osize += crush_choose_firstn( 931 913 map, 932 - map->buckets[-1-w[i]], 914 + map->buckets[bno], 933 915 weight, weight_max, 934 916 x, numrep, 935 917 curstep->arg2, ··· 941 923 choose_local_fallback_retries, 942 924 recurse_to_leaf, 943 925 vary_r, 926 + stable, 944 927 c+osize, 945 928 0); 946 929 } else { ··· 949 930 numrep : (result_max-osize)); 950 931 crush_choose_indep( 951 932 map, 952 - map->buckets[-1-w[i]], 933 + map->buckets[bno], 953 934 weight, weight_max, 954 935 x, out_size, numrep, 955 936 curstep->arg2,
+10
net/ceph/osd_client.c
··· 1770 1770 u32 osdmap_epoch; 1771 1771 int already_completed; 1772 1772 u32 bytes; 1773 + u8 decode_redir; 1773 1774 unsigned int i; 1774 1775 1775 1776 tid = le64_to_cpu(msg->hdr.tid); ··· 1842 1841 p += 8 + 4; /* skip replay_version */ 1843 1842 p += 8; /* skip user_version */ 1844 1843 1844 + if (le16_to_cpu(msg->hdr.version) >= 7) 1845 + ceph_decode_8_safe(&p, end, decode_redir, bad_put); 1846 + else 1847 + decode_redir = 1; 1848 + } else { 1849 + decode_redir = 0; 1850 + } 1851 + 1852 + if (decode_redir) { 1845 1853 err = ceph_redirect_decode(&p, end, &redir); 1846 1854 if (err) 1847 1855 goto bad_put;
+14 -5
net/ceph/osdmap.c
··· 342 342 c->choose_local_tries = ceph_decode_32(p); 343 343 c->choose_local_fallback_tries = ceph_decode_32(p); 344 344 c->choose_total_tries = ceph_decode_32(p); 345 - dout("crush decode tunable choose_local_tries = %d", 345 + dout("crush decode tunable choose_local_tries = %d\n", 346 346 c->choose_local_tries); 347 - dout("crush decode tunable choose_local_fallback_tries = %d", 347 + dout("crush decode tunable choose_local_fallback_tries = %d\n", 348 348 c->choose_local_fallback_tries); 349 - dout("crush decode tunable choose_total_tries = %d", 349 + dout("crush decode tunable choose_total_tries = %d\n", 350 350 c->choose_total_tries); 351 351 352 352 ceph_decode_need(p, end, sizeof(u32), done); 353 353 c->chooseleaf_descend_once = ceph_decode_32(p); 354 - dout("crush decode tunable chooseleaf_descend_once = %d", 354 + dout("crush decode tunable chooseleaf_descend_once = %d\n", 355 355 c->chooseleaf_descend_once); 356 356 357 357 ceph_decode_need(p, end, sizeof(u8), done); 358 358 c->chooseleaf_vary_r = ceph_decode_8(p); 359 - dout("crush decode tunable chooseleaf_vary_r = %d", 359 + dout("crush decode tunable chooseleaf_vary_r = %d\n", 360 360 c->chooseleaf_vary_r); 361 + 362 + /* skip straw_calc_version, allowed_bucket_algs */ 363 + ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); 364 + *p += sizeof(u8) + sizeof(u32); 365 + 366 + ceph_decode_need(p, end, sizeof(u8), done); 367 + c->chooseleaf_stable = ceph_decode_8(p); 368 + dout("crush decode tunable chooseleaf_stable = %d\n", 369 + c->chooseleaf_stable); 361 370 362 371 done: 363 372 dout("crush_decode success\n");