Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ceph-for-7.1-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"We have a series from Alex which extends CephFS client metrics with
support for per-subvolume data I/O performance and latency tracking
(metadata operations aren't included) and a good variety of fixes and
cleanups across RBD and CephFS"

* tag 'ceph-for-7.1-rc1' of https://github.com/ceph/ceph-client:
ceph: add subvolume metrics collection and reporting
ceph: parse subvolume_id from InodeStat v9 and store in inode
ceph: handle InodeStat v8 versioned field in reply parsing
libceph: Fix slab-out-of-bounds access in auth message processing
rbd: fix null-ptr-deref when device_add_disk() fails
crush: cleanup in crush_do_rule() method
ceph: clear s_cap_reconnect when ceph_pagelist_encode_32() fails
ceph: only d_add() negative dentries when they are unhashed
libceph: update outdated comment in ceph_sock_write_space()
libceph: Remove obsolete session key alignment logic
ceph: fix num_ops off-by-one when crypto allocation fails
libceph: Prevent potential null-ptr-deref in ceph_handle_auth_reply()

+1144 -50
+3 -3
drivers/block/rbd.c
··· 7165 7165 7166 7166 rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); 7167 7167 if (rc) 7168 - goto err_out_cleanup_disk; 7168 + goto err_out_device; 7169 7169 7170 7170 spin_lock(&rbd_dev_list_lock); 7171 7171 list_add_tail(&rbd_dev->node, &rbd_dev_list); ··· 7179 7179 module_put(THIS_MODULE); 7180 7180 return rc; 7181 7181 7182 - err_out_cleanup_disk: 7183 - rbd_free_disk(rbd_dev); 7182 + err_out_device: 7183 + device_del(&rbd_dev->dev); 7184 7184 err_out_image_lock: 7185 7185 rbd_dev_image_unlock(rbd_dev); 7186 7186 rbd_dev_device_release(rbd_dev);
+1 -1
fs/ceph/Makefile
··· 8 8 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9 9 export.o caps.o snap.o xattr.o quota.o io.o \ 10 10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 11 - debugfs.o util.o metric.o 11 + debugfs.o util.o metric.o subvolume_metrics.o 12 12 13 13 ceph-$(CONFIG_CEPH_FSCACHE) += cache.o 14 14 ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+18
fs/ceph/addr.c
··· 19 19 #include "mds_client.h" 20 20 #include "cache.h" 21 21 #include "metric.h" 22 + #include "subvolume_metrics.h" 22 23 #include "crypto.h" 23 24 #include <linux/ceph/osd_client.h> 24 25 #include <linux/ceph/striper.h> ··· 260 259 osd_data->length), false); 261 260 } 262 261 if (err > 0) { 262 + ceph_subvolume_metrics_record_io(fsc->mdsc, ceph_inode(inode), 263 + false, err, 264 + req->r_start_latency, 265 + req->r_end_latency); 263 266 subreq->transferred = err; 264 267 err = 0; 265 268 } ··· 828 823 829 824 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 830 825 req->r_end_latency, len, err); 826 + if (err >= 0 && len > 0) 827 + ceph_subvolume_metrics_record_io(fsc->mdsc, ci, true, len, 828 + req->r_start_latency, 829 + req->r_end_latency); 831 830 fscrypt_free_bounce_page(bounce_page); 832 831 ceph_osdc_put_request(req); 833 832 if (err == 0) ··· 971 962 972 963 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 973 964 req->r_end_latency, len, rc); 965 + 966 + if (rc >= 0 && len > 0) 967 + ceph_subvolume_metrics_record_io(mdsc, ci, true, len, 968 + req->r_start_latency, 969 + req->r_end_latency); 974 970 975 971 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); 976 972 ··· 1379 1365 rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, 1380 1366 folio); 1381 1367 if (rc) { 1368 + /* Did we just begin a new contiguous op? Nevermind! */ 1369 + if (ceph_wbc->len == 0) 1370 + ceph_wbc->num_ops--; 1371 + 1382 1372 folio_redirty_for_writepage(wbc, folio); 1383 1373 folio_unlock(folio); 1384 1374 break;
+157
fs/ceph/debugfs.c
··· 9 9 #include <linux/seq_file.h> 10 10 #include <linux/math64.h> 11 11 #include <linux/ktime.h> 12 + #include <linux/atomic.h> 12 13 13 14 #include <linux/ceph/libceph.h> 14 15 #include <linux/ceph/mon_client.h> 15 16 #include <linux/ceph/auth.h> 16 17 #include <linux/ceph/debugfs.h> 18 + #include <linux/ceph/decode.h> 17 19 18 20 #include "super.h" 19 21 ··· 23 21 24 22 #include "mds_client.h" 25 23 #include "metric.h" 24 + #include "subvolume_metrics.h" 25 + 26 + /** 27 + * struct ceph_session_feature_desc - Maps feature bits to names for debugfs 28 + * @bit: Feature bit number from enum ceph_feature_type (see mds_client.h) 29 + * @name: Human-readable feature name for debugfs output 30 + * 31 + * Used by metric_features_show() to display negotiated session features. 32 + */ 33 + struct ceph_session_feature_desc { 34 + unsigned int bit; 35 + const char *name; 36 + }; 37 + 38 + static const struct ceph_session_feature_desc ceph_session_feature_table[] = { 39 + { CEPHFS_FEATURE_METRIC_COLLECT, "METRIC_COLLECT" }, 40 + { CEPHFS_FEATURE_REPLY_ENCODING, "REPLY_ENCODING" }, 41 + { CEPHFS_FEATURE_RECLAIM_CLIENT, "RECLAIM_CLIENT" }, 42 + { CEPHFS_FEATURE_LAZY_CAP_WANTED, "LAZY_CAP_WANTED" }, 43 + { CEPHFS_FEATURE_MULTI_RECONNECT, "MULTI_RECONNECT" }, 44 + { CEPHFS_FEATURE_DELEG_INO, "DELEG_INO" }, 45 + { CEPHFS_FEATURE_ALTERNATE_NAME, "ALTERNATE_NAME" }, 46 + { CEPHFS_FEATURE_NOTIFY_SESSION_STATE, "NOTIFY_SESSION_STATE" }, 47 + { CEPHFS_FEATURE_OP_GETVXATTR, "OP_GETVXATTR" }, 48 + { CEPHFS_FEATURE_32BITS_RETRY_FWD, "32BITS_RETRY_FWD" }, 49 + { CEPHFS_FEATURE_NEW_SNAPREALM_INFO, "NEW_SNAPREALM_INFO" }, 50 + { CEPHFS_FEATURE_HAS_OWNER_UIDGID, "HAS_OWNER_UIDGID" }, 51 + { CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, "MDS_AUTH_CAPS_CHECK" }, 52 + { CEPHFS_FEATURE_SUBVOLUME_METRICS, "SUBVOLUME_METRICS" }, 53 + }; 26 54 27 55 static int mdsmap_show(struct seq_file *s, void *p) 28 56 { ··· 392 360 return 0; 393 361 } 394 362 363 + static int subvolume_metrics_show(struct seq_file *s, void *p) 364 + { 365 + struct ceph_fs_client *fsc = s->private; 366 + struct ceph_mds_client *mdsc = fsc->mdsc; 367 + struct ceph_subvol_metric_snapshot *snapshot = NULL; 368 + u32 nr = 0; 369 + u64 total_sent = 0; 370 + u64 nonzero_sends = 0; 371 + u32 i; 372 + 373 + if (!mdsc) { 374 + seq_puts(s, "mds client unavailable\n"); 375 + return 0; 376 + } 377 + 378 + mutex_lock(&mdsc->subvol_metrics_last_mutex); 379 + if (mdsc->subvol_metrics_last && mdsc->subvol_metrics_last_nr) { 380 + nr = mdsc->subvol_metrics_last_nr; 381 + snapshot = kmemdup_array(mdsc->subvol_metrics_last, nr, 382 + sizeof(*snapshot), GFP_KERNEL); 383 + if (!snapshot) 384 + nr = 0; 385 + } 386 + total_sent = mdsc->subvol_metrics_sent; 387 + nonzero_sends = mdsc->subvol_metrics_nonzero_sends; 388 + mutex_unlock(&mdsc->subvol_metrics_last_mutex); 389 + 390 + seq_puts(s, "Last sent subvolume metrics:\n"); 391 + if (!nr) { 392 + seq_puts(s, " (none)\n"); 393 + } else { 394 + seq_puts(s, " subvol_id rd_ops wr_ops rd_bytes wr_bytes rd_lat_us wr_lat_us\n"); 395 + for (i = 0; i < nr; i++) { 396 + const struct ceph_subvol_metric_snapshot *e = &snapshot[i]; 397 + 398 + seq_printf(s, " %-18llu %-9llu %-9llu %-14llu %-14llu %-14llu %-14llu\n", 399 + e->subvolume_id, 400 + e->read_ops, e->write_ops, 401 + e->read_bytes, e->write_bytes, 402 + e->read_latency_us, e->write_latency_us); 403 + } 404 + } 405 + kfree(snapshot); 406 + 407 + seq_puts(s, "\nStatistics:\n"); 408 + seq_printf(s, " entries_sent: %llu\n", total_sent); 409 + seq_printf(s, " non_zero_sends: %llu\n", nonzero_sends); 410 + 411 + seq_puts(s, "\nPending (unsent) subvolume metrics:\n"); 412 + ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s); 413 + return 0; 414 + } 415 + 395 416 DEFINE_SHOW_ATTRIBUTE(mdsmap); 396 417 DEFINE_SHOW_ATTRIBUTE(mdsc); 397 418 DEFINE_SHOW_ATTRIBUTE(caps); ··· 454 369 DEFINE_SHOW_ATTRIBUTE(metrics_latency); 455 370 DEFINE_SHOW_ATTRIBUTE(metrics_size); 456 371 DEFINE_SHOW_ATTRIBUTE(metrics_caps); 372 + DEFINE_SHOW_ATTRIBUTE(subvolume_metrics); 457 373 374 + static int metric_features_show(struct seq_file *s, void *p) 375 + { 376 + struct ceph_fs_client *fsc = s->private; 377 + struct ceph_mds_client *mdsc = fsc->mdsc; 378 + unsigned long session_features = 0; 379 + bool have_session = false; 380 + bool metric_collect = false; 381 + bool subvol_support = false; 382 + bool metrics_enabled = false; 383 + bool subvol_enabled = false; 384 + int i; 385 + 386 + if (!mdsc) { 387 + seq_puts(s, "mds client unavailable\n"); 388 + return 0; 389 + } 390 + 391 + mutex_lock(&mdsc->mutex); 392 + if (mdsc->metric.session) { 393 + have_session = true; 394 + session_features = mdsc->metric.session->s_features; 395 + } 396 + mutex_unlock(&mdsc->mutex); 397 + 398 + if (have_session) { 399 + metric_collect = 400 + test_bit(CEPHFS_FEATURE_METRIC_COLLECT, 401 + &session_features); 402 + subvol_support = 403 + test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, 404 + &session_features); 405 + } 406 + 407 + metrics_enabled = !disable_send_metrics && have_session && metric_collect; 408 + subvol_enabled = metrics_enabled && subvol_support; 409 + 410 + seq_printf(s, 411 + "metrics_enabled: %s (disable_send_metrics=%d, session=%s, metric_collect=%s)\n", 412 + metrics_enabled ? "yes" : "no", 413 + disable_send_metrics ? 1 : 0, 414 + have_session ? "yes" : "no", 415 + metric_collect ? "yes" : "no"); 416 + seq_printf(s, "subvolume_metrics_enabled: %s\n", 417 + subvol_enabled ? "yes" : "no"); 418 + seq_printf(s, "session_feature_bits: 0x%lx\n", session_features); 419 + 420 + if (!have_session) { 421 + seq_puts(s, "(no active MDS session for metrics)\n"); 422 + return 0; 423 + } 424 + 425 + for (i = 0; i < ARRAY_SIZE(ceph_session_feature_table); i++) { 426 + const struct ceph_session_feature_desc *desc = 427 + &ceph_session_feature_table[i]; 428 + bool set = test_bit(desc->bit, &session_features); 429 + 430 + seq_printf(s, " %-24s : %s\n", desc->name, 431 + set ? "yes" : "no"); 432 + } 433 + 434 + return 0; 435 + } 436 + 437 + DEFINE_SHOW_ATTRIBUTE(metric_features); 458 438 459 439 /* 460 440 * debugfs ··· 554 404 debugfs_remove(fsc->debugfs_caps); 555 405 debugfs_remove(fsc->debugfs_status); 556 406 debugfs_remove(fsc->debugfs_mdsc); 407 + debugfs_remove(fsc->debugfs_subvolume_metrics); 557 408 debugfs_remove_recursive(fsc->debugfs_metrics_dir); 558 409 doutc(fsc->client, "done\n"); 559 410 } ··· 619 468 &metrics_size_fops); 620 469 debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, 621 470 &metrics_caps_fops); 471 + debugfs_create_file("metric_features", 0400, fsc->debugfs_metrics_dir, 472 + fsc, &metric_features_fops); 473 + fsc->debugfs_subvolume_metrics = 474 + debugfs_create_file("subvolumes", 0400, 475 + fsc->debugfs_metrics_dir, fsc, 476 + &subvolume_metrics_fops); 622 477 doutc(fsc->client, "done\n"); 623 478 } 624 479
+4 -2
fs/ceph/dir.c
··· 769 769 d_drop(dentry); 770 770 err = -ENOENT; 771 771 } else { 772 - d_add(dentry, NULL); 772 + if (d_unhashed(dentry)) 773 + d_add(dentry, NULL); 773 774 } 774 775 } 775 776 } ··· 841 840 spin_unlock(&ci->i_ceph_lock); 842 841 doutc(cl, " dir %llx.%llx complete, -ENOENT\n", 843 842 ceph_vinop(dir)); 844 - d_add(dentry, NULL); 843 + if (d_unhashed(dentry)) 844 + d_add(dentry, NULL); 845 845 di->lease_shared_gen = atomic_read(&ci->i_shared_gen); 846 846 return NULL; 847 847 }
+64 -4
fs/ceph/file.c
··· 19 19 #include "cache.h" 20 20 #include "io.h" 21 21 #include "metric.h" 22 + #include "subvolume_metrics.h" 23 + 24 + /* 25 + * Record I/O for subvolume metrics tracking. 26 + * 27 + * Callers must ensure bytes > 0 for reads (ret > 0 check) to avoid counting 28 + * EOF as an I/O operation. For writes, the condition is (ret >= 0 && len > 0). 29 + */ 30 + static inline void ceph_record_subvolume_io(struct inode *inode, bool is_write, 31 + ktime_t start, ktime_t end, 32 + size_t bytes) 33 + { 34 + if (!bytes) 35 + return; 36 + 37 + ceph_subvolume_metrics_record_io(ceph_sb_to_mdsc(inode->i_sb), 38 + ceph_inode(inode), 39 + is_write, bytes, start, end); 40 + } 22 41 23 42 static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags) 24 43 { ··· 1159 1140 req->r_start_latency, 1160 1141 req->r_end_latency, 1161 1142 read_len, ret); 1143 + /* 1144 + * Only record subvolume metrics for actual bytes read. 1145 + * ret == 0 means EOF (no data), not an I/O operation. 1146 + */ 1147 + if (ret > 0) 1148 + ceph_record_subvolume_io(inode, false, 1149 + req->r_start_latency, 1150 + req->r_end_latency, 1151 + ret); 1162 1152 1163 1153 if (ret > 0) 1164 1154 objver = req->r_version; ··· 1413 1385 1414 1386 /* r_start_latency == 0 means the request was not submitted */ 1415 1387 if (req->r_start_latency) { 1416 - if (aio_req->write) 1388 + if (aio_req->write) { 1417 1389 ceph_update_write_metrics(metric, req->r_start_latency, 1418 1390 req->r_end_latency, len, rc); 1419 - else 1391 + if (rc >= 0 && len) 1392 + ceph_record_subvolume_io(inode, true, 1393 + req->r_start_latency, 1394 + req->r_end_latency, 1395 + len); 1396 + } else { 1420 1397 ceph_update_read_metrics(metric, req->r_start_latency, 1421 1398 req->r_end_latency, len, rc); 1399 + if (rc > 0) 1400 + ceph_record_subvolume_io(inode, false, 1401 + req->r_start_latency, 1402 + req->r_end_latency, 1403 + rc); 1404 + } 1422 1405 } 1423 1406 1424 1407 put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, ··· 1653 1614 ceph_osdc_start_request(req->r_osdc, req); 1654 1615 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1655 1616 1656 - if (write) 1617 + if (write) { 1657 1618 ceph_update_write_metrics(metric, req->r_start_latency, 1658 1619 req->r_end_latency, len, ret); 1659 - else 1620 + if (ret >= 0 && len) 1621 + ceph_record_subvolume_io(inode, true, 1622 + req->r_start_latency, 1623 + req->r_end_latency, 1624 + len); 1625 + } else { 1660 1626 ceph_update_read_metrics(metric, req->r_start_latency, 1661 1627 req->r_end_latency, len, ret); 1628 + if (ret > 0) 1629 + ceph_record_subvolume_io(inode, false, 1630 + req->r_start_latency, 1631 + req->r_end_latency, 1632 + ret); 1633 + } 1662 1634 1663 1635 size = i_size_read(inode); 1664 1636 if (!write) { ··· 1922 1872 req->r_start_latency, 1923 1873 req->r_end_latency, 1924 1874 read_len, ret); 1875 + if (ret > 0) 1876 + ceph_record_subvolume_io(inode, false, 1877 + req->r_start_latency, 1878 + req->r_end_latency, 1879 + ret); 1925 1880 1926 1881 /* Ok if object is not already present */ 1927 1882 if (ret == -ENOENT) { ··· 2091 2036 2092 2037 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 2093 2038 req->r_end_latency, len, ret); 2039 + if (ret >= 0 && write_len) 2040 + ceph_record_subvolume_io(inode, true, 2041 + req->r_start_latency, 2042 + req->r_end_latency, 2043 + write_len); 2094 2044 ceph_osdc_put_request(req); 2095 2045 if (ret != 0) { 2096 2046 doutc(cl, "osd write returned %d\n", ret);
+41
fs/ceph/inode.c
··· 638 638 639 639 ci->i_max_bytes = 0; 640 640 ci->i_max_files = 0; 641 + ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE; 641 642 642 643 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 643 644 memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); ··· 742 741 doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode)); 743 742 744 743 percpu_counter_dec(&mdsc->metric.total_inodes); 744 + 745 + ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE; 745 746 746 747 netfs_wait_for_outstanding_io(inode); 747 748 truncate_inode_pages_final(&inode->i_data); ··· 874 871 } 875 872 } 876 873 return queue_trunc; 874 + } 875 + 876 + /* 877 + * Set the subvolume ID for an inode. 878 + * 879 + * The subvolume_id identifies which CephFS subvolume this inode belongs to. 880 + * CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset - the MDS only sends 881 + * non-zero IDs for inodes within subvolumes. 882 + * 883 + * An inode's subvolume membership is immutable - once an inode is created 884 + * in a subvolume, it stays there. Therefore, if we already have a valid 885 + * (non-zero) subvolume_id and receive a different one, that indicates a bug. 886 + */ 887 + void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id) 888 + { 889 + struct ceph_inode_info *ci; 890 + u64 old; 891 + 892 + if (!inode || subvolume_id == CEPH_SUBVOLUME_ID_NONE) 893 + return; 894 + 895 + ci = ceph_inode(inode); 896 + old = READ_ONCE(ci->i_subvolume_id); 897 + 898 + if (old == subvolume_id) 899 + return; 900 + 901 + if (old != CEPH_SUBVOLUME_ID_NONE) { 902 + /* subvolume_id should not change once set */ 903 + WARN_ON_ONCE(1); 904 + return; 905 + } 906 + 907 + WRITE_ONCE(ci->i_subvolume_id, subvolume_id); 877 908 } 878 909 879 910 void ceph_fill_file_time(struct inode *inode, int issued, ··· 1113 1076 new_issued = ~issued & info_caps; 1114 1077 1115 1078 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); 1079 + ceph_inode_set_subvolume(inode, iinfo->subvolume_id); 1116 1080 1117 1081 #ifdef CONFIG_FS_ENCRYPTION 1118 1082 if (iinfo->fscrypt_auth_len && ··· 1621 1583 goto done; 1622 1584 } 1623 1585 if (parent_dir) { 1586 + ceph_inode_set_subvolume(parent_dir, 1587 + rinfo->diri.subvolume_id); 1624 1588 err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, 1625 1589 rinfo->dirfrag, session, -1, 1626 1590 &req->r_caps_reservation); ··· 1711 1671 BUG_ON(!req->r_target_inode); 1712 1672 1713 1673 in = req->r_target_inode; 1674 + ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id); 1714 1675 err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, 1715 1676 NULL, session, 1716 1677 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+82 -16
fs/ceph/mds_client.c
··· 68 68 69 69 static const struct ceph_connection_operations mds_con_ops; 70 70 71 + static void ceph_metric_bind_session(struct ceph_mds_client *mdsc, 72 + struct ceph_mds_session *session) 73 + { 74 + struct ceph_mds_session *old; 75 + 76 + if (!mdsc || !session || disable_send_metrics) 77 + return; 78 + 79 + old = mdsc->metric.session; 80 + mdsc->metric.session = ceph_get_mds_session(session); 81 + if (old) 82 + ceph_put_mds_session(old); 83 + 84 + metric_schedule_delayed(&mdsc->metric); 85 + } 71 86 72 87 /* 73 88 * mds reply parsing ··· 111 96 return -EIO; 112 97 } 113 98 114 - /* 115 - * parse individual inode info 116 - */ 117 99 static int parse_reply_info_in(void **p, void *end, 118 100 struct ceph_mds_reply_info_in *info, 119 - u64 features) 101 + u64 features, 102 + struct ceph_mds_client *mdsc) 120 103 { 121 104 int err = 0; 122 105 u8 struct_v = 0; 106 + u8 struct_compat = 0; 107 + u32 struct_len = 0; 108 + 109 + info->subvolume_id = CEPH_SUBVOLUME_ID_NONE; 123 110 124 111 if (features == (u64)-1) { 125 - u32 struct_len; 126 - u8 struct_compat; 127 112 ceph_decode_8_safe(p, end, struct_v, bad); 128 113 ceph_decode_8_safe(p, end, struct_compat, bad); 129 114 /* struct_v is expected to be >= 1. we only understand ··· 247 232 info->fscrypt_file_len, bad); 248 233 } 249 234 } 235 + 236 + /* 237 + * InodeStat encoding versions: 238 + * v1-v7: various fields added over time 239 + * v8: added optmetadata (versioned sub-structure containing 240 + * optional inode metadata like charmap for case-insensitive 241 + * filesystems). The kernel client doesn't support 242 + * case-insensitive lookups, so we skip this field. 243 + * v9: added subvolume_id (parsed below) 244 + */ 245 + if (struct_v >= 8) { 246 + u32 v8_struct_len; 247 + 248 + /* skip optmetadata versioned sub-structure */ 249 + ceph_decode_skip_8(p, end, bad); /* struct_v */ 250 + ceph_decode_skip_8(p, end, bad); /* struct_compat */ 251 + ceph_decode_32_safe(p, end, v8_struct_len, bad); 252 + ceph_decode_skip_n(p, end, v8_struct_len, bad); 253 + } 254 + 255 + /* struct_v 9 added subvolume_id */ 256 + if (struct_v >= 9) 257 + ceph_decode_64_safe(p, end, info->subvolume_id, bad); 258 + 250 259 *p = end; 251 260 } else { 252 261 /* legacy (unversioned) struct */ ··· 403 364 */ 404 365 static int parse_reply_info_trace(void **p, void *end, 405 366 struct ceph_mds_reply_info_parsed *info, 406 - u64 features) 367 + u64 features, 368 + struct ceph_mds_client *mdsc) 407 369 { 408 370 int err; 409 371 410 372 if (info->head->is_dentry) { 411 - err = parse_reply_info_in(p, end, &info->diri, features); 373 + err = parse_reply_info_in(p, end, &info->diri, features, mdsc); 412 374 if (err < 0) 413 375 goto out_bad; 414 376 ··· 429 389 } 430 390 431 391 if (info->head->is_target) { 432 - err = parse_reply_info_in(p, end, &info->targeti, features); 392 + err = parse_reply_info_in(p, end, &info->targeti, features, 393 + mdsc); 433 394 if (err < 0) 434 395 goto out_bad; 435 396 } ··· 451 410 */ 452 411 static int parse_reply_info_readdir(void **p, void *end, 453 412 struct ceph_mds_request *req, 454 - u64 features) 413 + u64 features, 414 + struct ceph_mds_client *mdsc) 455 415 { 456 416 struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; 457 417 struct ceph_client *cl = req->r_mdsc->fsc->client; ··· 567 525 rde->name_len = oname.len; 568 526 569 527 /* inode */ 570 - err = parse_reply_info_in(p, end, &rde->inode, features); 528 + err = parse_reply_info_in(p, end, &rde->inode, features, mdsc); 571 529 if (err < 0) 572 530 goto out_bad; 573 531 /* ceph_readdir_prepopulate() will update it */ ··· 775 733 if (op == CEPH_MDS_OP_GETFILELOCK) 776 734 return parse_reply_info_filelock(p, end, info, features); 777 735 else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) 778 - return parse_reply_info_readdir(p, end, req, features); 736 + return parse_reply_info_readdir(p, end, req, features, 737 + req->r_mdsc); 779 738 else if (op == CEPH_MDS_OP_CREATE) 780 739 return parse_reply_info_create(p, end, info, features, s); 781 740 else if (op == CEPH_MDS_OP_GETVXATTR) ··· 805 762 ceph_decode_32_safe(&p, end, len, bad); 806 763 if (len > 0) { 807 764 ceph_decode_need(&p, end, len, bad); 808 - err = parse_reply_info_trace(&p, p+len, info, features); 765 + err = parse_reply_info_trace(&p, p + len, info, features, 766 + s->s_mdsc); 809 767 if (err < 0) 810 768 goto out_bad; 811 769 } ··· 815 771 ceph_decode_32_safe(&p, end, len, bad); 816 772 if (len > 0) { 817 773 ceph_decode_need(&p, end, len, bad); 818 - err = parse_reply_info_extra(&p, p+len, req, features, s); 774 + err = parse_reply_info_extra(&p, p + len, req, features, s); 819 775 if (err < 0) 820 776 goto out_bad; 821 777 } ··· 4013 3969 goto out_err; 4014 3970 } 4015 3971 req->r_target_inode = in; 3972 + ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id); 4016 3973 } 4017 3974 4018 3975 mutex_lock(&session->s_mutex); ··· 4362 4317 } 4363 4318 mdsc->s_cap_auths_num = cap_auths_num; 4364 4319 mdsc->s_cap_auths = cap_auths; 4320 + 4321 + session->s_features = features; 4322 + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, 4323 + &session->s_features)) 4324 + ceph_metric_bind_session(mdsc, session); 4365 4325 } 4366 4326 if (op == CEPH_SESSION_CLOSE) { 4367 4327 ceph_get_mds_session(session); ··· 4393 4343 pr_info_client(cl, "mds%d reconnect success\n", 4394 4344 session->s_mds); 4395 4345 4396 - session->s_features = features; 4346 + if (test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, 4347 + &session->s_features)) 4348 + ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, true); 4349 + else 4350 + ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, false); 4397 4351 if (session->s_state == CEPH_MDS_SESSION_OPEN) { 4398 4352 pr_notice_client(cl, "mds%d is already opened\n", 4399 4353 session->s_mds); ··· 5010 4956 /* placeholder for nr_caps */ 5011 4957 err = ceph_pagelist_encode_32(recon_state.pagelist, 0); 5012 4958 if (err) 5013 - goto fail; 4959 + goto fail_clear_cap_reconnect; 5014 4960 5015 4961 if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { 5016 4962 recon_state.msg_version = 3; ··· 5100 5046 ceph_pagelist_release(recon_state.pagelist); 5101 5047 return; 5102 5048 5049 + fail_clear_cap_reconnect: 5050 + spin_lock(&session->s_cap_lock); 5051 + session->s_cap_reconnect = 0; 5052 + spin_unlock(&session->s_cap_lock); 5103 5053 fail: 5104 5054 ceph_msg_put(reply); 5105 5055 up_read(&mdsc->snap_rwsem); ··· 5640 5582 err = ceph_metric_init(&mdsc->metric); 5641 5583 if (err) 5642 5584 goto err_mdsmap; 5585 + ceph_subvolume_metrics_init(&mdsc->subvol_metrics); 5586 + mutex_init(&mdsc->subvol_metrics_last_mutex); 5587 + mdsc->subvol_metrics_last = NULL; 5588 + mdsc->subvol_metrics_last_nr = 0; 5589 + mdsc->subvol_metrics_sent = 0; 5590 + mdsc->subvol_metrics_nonzero_sends = 0; 5643 5591 5644 5592 spin_lock_init(&mdsc->dentry_list_lock); 5645 5593 INIT_LIST_HEAD(&mdsc->dentry_leases); ··· 6179 6115 ceph_mdsc_stop(mdsc); 6180 6116 6181 6117 ceph_metric_destroy(&mdsc->metric); 6118 + ceph_subvolume_metrics_destroy(&mdsc->subvol_metrics); 6119 + kfree(mdsc->subvol_metrics_last); 6182 6120 6183 6121 fsc->mdsc = NULL; 6184 6122 kfree(mdsc);
+13 -1
fs/ceph/mds_client.h
··· 18 18 19 19 #include "mdsmap.h" 20 20 #include "metric.h" 21 + #include "subvolume_metrics.h" 21 22 #include "super.h" 22 23 23 24 /* The first 8 bits are reserved for old ceph releases */ ··· 37 36 CEPHFS_FEATURE_NEW_SNAPREALM_INFO, 38 37 CEPHFS_FEATURE_HAS_OWNER_UIDGID, 39 38 CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, 39 + CEPHFS_FEATURE_SUBVOLUME_METRICS, 40 40 41 - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, 41 + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_SUBVOLUME_METRICS, 42 42 }; 43 43 44 44 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ ··· 56 54 CEPHFS_FEATURE_32BITS_RETRY_FWD, \ 57 55 CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ 58 56 CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \ 57 + CEPHFS_FEATURE_SUBVOLUME_METRICS, \ 59 58 } 60 59 61 60 /* ··· 121 118 u32 fscrypt_file_len; 122 119 u64 rsnaps; 123 120 u64 change_attr; 121 + u64 subvolume_id; 124 122 }; 125 123 126 124 struct ceph_mds_reply_dir_entry { ··· 540 536 struct list_head dentry_dir_leases; /* lru list */ 541 537 542 538 struct ceph_client_metric metric; 539 + struct ceph_subvolume_metrics_tracker subvol_metrics; 540 + 541 + /* Subvolume metrics send tracking */ 542 + struct mutex subvol_metrics_last_mutex; 543 + struct ceph_subvol_metric_snapshot *subvol_metrics_last; 544 + u32 subvol_metrics_last_nr; 545 + u64 subvol_metrics_sent; 546 + u64 subvol_metrics_nonzero_sends; 543 547 544 548 spinlock_t snapid_map_lock; 545 549 struct rb_root snapid_map_tree;
+177 -6
fs/ceph/metric.c
··· 4 4 #include <linux/types.h> 5 5 #include <linux/percpu_counter.h> 6 6 #include <linux/math64.h> 7 + #include <linux/ratelimit.h> 8 + 9 + #include <linux/ceph/decode.h> 7 10 8 11 #include "metric.h" 9 12 #include "mds_client.h" 13 + 14 + static bool metrics_disable_warned; 15 + 16 + static inline u32 ceph_subvolume_entry_payload_len(void) 17 + { 18 + return sizeof(struct ceph_subvolume_metric_entry_wire); 19 + } 20 + 21 + static inline u32 ceph_subvolume_entry_encoded_len(void) 22 + { 23 + return CEPH_ENCODING_START_BLK_LEN + 24 + ceph_subvolume_entry_payload_len(); 25 + } 26 + 27 + static inline u32 ceph_subvolume_outer_payload_len(u32 nr_subvols) 28 + { 29 + /* count is encoded as le64 (size_t on wire) to match FUSE client */ 30 + return sizeof(__le64) + 31 + nr_subvols * ceph_subvolume_entry_encoded_len(); 32 + } 33 + 34 + static inline u32 ceph_subvolume_metric_data_len(u32 nr_subvols) 35 + { 36 + return CEPH_ENCODING_START_BLK_LEN + 37 + ceph_subvolume_outer_payload_len(nr_subvols); 38 + } 39 + 40 + static inline u32 ceph_subvolume_clamp_u32(u64 val) 41 + { 42 + return val > U32_MAX ? U32_MAX : (u32)val; 43 + } 44 + 45 + static void ceph_init_subvolume_wire_entry( 46 + struct ceph_subvolume_metric_entry_wire *dst, 47 + const struct ceph_subvol_metric_snapshot *src) 48 + { 49 + dst->subvolume_id = cpu_to_le64(src->subvolume_id); 50 + dst->read_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->read_ops)); 51 + dst->write_ops = cpu_to_le32(ceph_subvolume_clamp_u32(src->write_ops)); 52 + dst->read_bytes = cpu_to_le64(src->read_bytes); 53 + dst->write_bytes = cpu_to_le64(src->write_bytes); 54 + dst->read_latency_us = cpu_to_le64(src->read_latency_us); 55 + dst->write_latency_us = cpu_to_le64(src->write_latency_us); 56 + dst->time_stamp = 0; 57 + } 58 + 59 + static int ceph_encode_subvolume_metrics(void **p, void *end, 60 + struct ceph_subvol_metric_snapshot *subvols, 61 + u32 nr_subvols) 62 + { 63 + u32 i; 64 + 65 + ceph_start_encoding(p, 1, 1, 66 + ceph_subvolume_outer_payload_len(nr_subvols)); 67 + /* count is encoded as le64 (size_t on wire) to match FUSE client */ 68 + ceph_encode_64_safe(p, end, (u64)nr_subvols, enc_err); 69 + 70 + for (i = 0; i < nr_subvols; i++) { 71 + struct ceph_subvolume_metric_entry_wire wire_entry; 72 + 73 + ceph_init_subvolume_wire_entry(&wire_entry, &subvols[i]); 74 + ceph_start_encoding(p, 1, 1, 75 + ceph_subvolume_entry_payload_len()); 76 + ceph_encode_copy_safe(p, end, &wire_entry, 77 + sizeof(wire_entry), enc_err); 78 + } 79 + 80 + return 0; 81 + enc_err: 82 + return -ERANGE; 83 + } 10 84 11 85 static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) 12 86 { ··· 103 29 struct ceph_read_io_size *rsize; 104 30 struct ceph_write_io_size *wsize; 105 31 struct ceph_client_metric *m = &mdsc->metric; 32 + struct ceph_subvol_metric_snapshot *subvols = NULL; 106 33 u64 nr_caps = atomic64_read(&m->total_caps); 107 34 u32 header_len = sizeof(struct ceph_metric_header); 108 35 struct ceph_client *cl = mdsc->fsc->client; 109 36 struct ceph_msg *msg; 37 + u32 nr_subvols = 0; 38 + size_t subvol_len = 0; 39 + void *cursor; 110 40 s64 sum; 111 41 s32 items = 0; 112 42 s32 len; ··· 123 45 } 124 46 mutex_unlock(&mdsc->mutex); 125 47 48 + if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) && 49 + test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, &s->s_features)) { 50 + int ret; 51 + 52 + ret = ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics, 53 + &subvols, &nr_subvols, 54 + true); 55 + if (ret) { 56 + pr_warn_client(cl, "failed to snapshot subvolume metrics: %d\n", 57 + ret); 58 + /* 59 + * On error, ceph_subvolume_metrics_snapshot() guarantees 60 + * *out = NULL and *nr = 0 at function entry, so subvols 61 + * is already NULL here - no cleanup needed. 62 + */ 63 + nr_subvols = 0; 64 + subvols = NULL; 65 + } 66 + } 67 + 68 + if (nr_subvols) { 69 + /* type (le32) + ENCODE_START payload - no metric header */ 70 + subvol_len = sizeof(__le32) + 71 + ceph_subvolume_metric_data_len(nr_subvols); 72 + } 73 + 126 74 len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) 127 75 + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) 128 76 + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) 129 - + sizeof(*wsize); 77 + + sizeof(*wsize) + subvol_len; 130 78 131 79 msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); 132 80 if (!msg) { 133 81 pr_err_client(cl, "to mds%d, failed to allocate message\n", 134 82 s->s_mds); 83 + kfree(subvols); 135 84 return false; 136 85 } 137 86 ··· 277 172 wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); 278 173 items++; 279 174 175 + cursor = wsize + 1; 176 + 177 + if (nr_subvols) { 178 + void *payload; 179 + void *payload_end; 180 + int ret; 181 + 182 + /* Emit only the type (le32), no ver/compat/data_len */ 183 + ceph_encode_32(&cursor, CLIENT_METRIC_TYPE_SUBVOLUME_METRICS); 184 + items++; 185 + 186 + payload = cursor; 187 + payload_end = (char *)payload + 188 + ceph_subvolume_metric_data_len(nr_subvols); 189 + 190 + ret = ceph_encode_subvolume_metrics(&payload, payload_end, 191 + subvols, nr_subvols); 192 + if (ret) { 193 + pr_warn_client(cl, 194 + "failed to encode subvolume metrics\n"); 195 + kfree(subvols); 196 + ceph_msg_put(msg); 197 + return false; 198 + } 199 + 200 + WARN_ON(payload != payload_end); 201 + cursor = payload; 202 + } 203 + 280 204 put_unaligned_le32(items, &head->num); 281 - msg->front.iov_len = len; 205 + msg->front.iov_len = (char *)cursor - (char *)head; 282 206 msg->hdr.version = cpu_to_le16(1); 283 207 msg->hdr.compat_version = cpu_to_le16(1); 284 208 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 209 + 285 210 ceph_con_send(&s->s_con, msg); 211 + 212 + if (nr_subvols) { 213 + mutex_lock(&mdsc->subvol_metrics_last_mutex); 214 + kfree(mdsc->subvol_metrics_last); 215 + mdsc->subvol_metrics_last = subvols; 216 + mdsc->subvol_metrics_last_nr = nr_subvols; 217 + mdsc->subvol_metrics_sent += nr_subvols; 218 + mdsc->subvol_metrics_nonzero_sends++; 219 + mutex_unlock(&mdsc->subvol_metrics_last_mutex); 220 + 221 + subvols = NULL; 222 + } 223 + kfree(subvols); 286 224 287 225 return true; 288 226 } ··· 346 198 * Skip it if MDS doesn't support the metric collection, 347 199 * or the MDS will close the session's socket connection 348 200 * directly when it get this message. 201 + * 202 + * Also skip sessions that don't support SUBVOLUME_METRICS 203 + * when subvolume metrics collection is enabled. This ensures 204 + * we only send subvolume metrics to MDSs that understand them. 205 + * If no session supports the feature, metrics won't be sent. 349 206 */ 350 207 if (check_session_state(s) && 351 208 test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features)) { 209 + if (ceph_subvolume_metrics_enabled(&mdsc->subvol_metrics) && 210 + !test_bit(CEPHFS_FEATURE_SUBVOLUME_METRICS, 211 + &s->s_features)) { 212 + ceph_put_mds_session(s); 213 + continue; 214 + } 352 215 mdsc->metric.session = s; 353 216 break; 354 217 } ··· 376 217 struct ceph_mds_client *mdsc = 377 218 container_of(m, struct ceph_mds_client, metric); 378 219 379 - if (mdsc->stopping || disable_send_metrics) 220 + if (mdsc->stopping) 380 221 return; 222 + 223 + if (disable_send_metrics) { 224 + if (!metrics_disable_warned) { 225 + pr_info("ceph: metrics sending disabled via module parameter\n"); 226 + metrics_disable_warned = true; 227 + } 228 + return; 229 + } 230 + metrics_disable_warned = false; 381 231 382 232 if (!m->session || !check_session_state(m->session)) { 383 233 if (m->session) { ··· 395 227 } 396 228 metric_get_session(mdsc); 397 229 } 398 - if (m->session) { 230 + 231 + if (m->session) 399 232 ceph_mdsc_send_metrics(mdsc, m->session); 400 - metric_schedule_delayed(m); 401 - } 233 + else 234 + pr_warn_ratelimited("ceph: metrics worker has no MDS session\n"); 235 + 236 + metric_schedule_delayed(m); 402 237 } 403 238 404 239 int ceph_metric_init(struct ceph_client_metric *m)
+38 -1
fs/ceph/metric.h
··· 25 25 CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, 26 26 CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, 27 27 CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, 28 + CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, 28 29 29 - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, 30 + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, 30 31 }; 31 32 32 33 /* ··· 51 50 CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ 52 51 CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ 53 52 CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ 53 + CLIENT_METRIC_TYPE_SUBVOLUME_METRICS, \ 54 54 \ 55 55 CLIENT_METRIC_TYPE_MAX, \ 56 56 } ··· 139 137 struct ceph_metric_header header; 140 138 __le64 total_ops; 141 139 __le64 total_size; 140 + } __packed; 141 + 142 + /** 143 + * struct ceph_subvolume_metric_entry_wire - On-wire format sent to MDS 144 + * @subvolume_id: Subvolume identifier 145 + * @read_ops: Read operation count (32-bit, clamped from 64-bit internal) 146 + * @write_ops: Write operation count (32-bit, clamped from 64-bit internal) 147 + * @read_bytes: Total bytes read 148 + * @write_bytes: Total bytes written 149 + * @read_latency_us: Cumulative read latency in microseconds 150 + * @write_latency_us: Cumulative write latency in microseconds 151 + * @time_stamp: Collection timestamp (currently unused, set to 0) 152 + * 153 + * Wire format must match C++ AggregatedIOMetrics struct in MDS. 154 + */ 155 + struct ceph_subvolume_metric_entry_wire { 156 + __le64 subvolume_id; 157 + __le32 read_ops; 158 + __le32 write_ops; 159 + __le64 read_bytes; 160 + __le64 write_bytes; 161 + __le64 read_latency_us; 162 + __le64 write_latency_us; 163 + __le64 time_stamp; 164 + } __packed; 165 + 166 + /* Old struct kept for internal tracking, not used on wire */ 167 + struct ceph_subvolume_metric_entry { 168 + __le64 subvolume_id; 169 + __le64 read_ops; 170 + __le64 write_ops; 171 + __le64 read_bytes; 172 + __le64 write_bytes; 173 + __le64 read_latency_us; 174 + __le64 write_latency_us; 142 175 } __packed; 143 176 144 177 struct ceph_metric_head {
+416
fs/ceph/subvolume_metrics.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/ceph/ceph_debug.h> 3 + 4 + #include <linux/math64.h> 5 + #include <linux/slab.h> 6 + #include <linux/seq_file.h> 7 + 8 + #include "subvolume_metrics.h" 9 + #include "mds_client.h" 10 + #include "super.h" 11 + 12 + /** 13 + * struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node 14 + * @node: Red-black tree linkage for tracker->tree 15 + * @subvolume_id: Subvolume identifier (key for rb-tree lookup) 16 + * @read_ops: Accumulated read operation count since last snapshot 17 + * @write_ops: Accumulated write operation count since last snapshot 18 + * @read_bytes: Accumulated bytes read since last snapshot 19 + * @write_bytes: Accumulated bytes written since last snapshot 20 + * @read_latency_us: Sum of read latencies in microseconds 21 + * @write_latency_us: Sum of write latencies in microseconds 22 + */ 23 + struct ceph_subvol_metric_rb_entry { 24 + struct rb_node node; 25 + u64 subvolume_id; 26 + u64 read_ops; 27 + u64 write_ops; 28 + u64 read_bytes; 29 + u64 write_bytes; 30 + u64 read_latency_us; 31 + u64 write_latency_us; 32 + }; 33 + 34 + static struct kmem_cache *ceph_subvol_metric_entry_cachep; 35 + 36 + void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker) 37 + { 38 + spin_lock_init(&tracker->lock); 39 + tracker->tree = RB_ROOT_CACHED; 40 + tracker->nr_entries = 0; 41 + tracker->enabled = false; 42 + atomic64_set(&tracker->snapshot_attempts, 0); 43 + atomic64_set(&tracker->snapshot_empty, 0); 44 + atomic64_set(&tracker->snapshot_failures, 0); 45 + atomic64_set(&tracker->record_calls, 0); 46 + atomic64_set(&tracker->record_disabled, 0); 47 + atomic64_set(&tracker->record_no_subvol, 0); 48 + atomic64_set(&tracker->total_read_ops, 0); 49 + atomic64_set(&tracker->total_read_bytes, 0); 50 + atomic64_set(&tracker->total_write_ops, 0); 51 + atomic64_set(&tracker->total_write_bytes, 0); 52 + } 53 + 54 + static struct ceph_subvol_metric_rb_entry * 55 + __lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id) 56 + { 57 + struct rb_node *node; 58 + 59 + node = tracker->tree.rb_root.rb_node; 60 + while (node) { 61 + struct ceph_subvol_metric_rb_entry *entry = 62 + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); 63 + 64 + if (subvol_id < entry->subvolume_id) 65 + node = node->rb_left; 66 + else if (subvol_id > entry->subvolume_id) 67 + node = node->rb_right; 68 + else 69 + return entry; 70 + } 71 + 72 + return NULL; 73 + } 74 + 75 + static struct ceph_subvol_metric_rb_entry * 76 + __insert_entry(struct ceph_subvolume_metrics_tracker *tracker, 77 + struct ceph_subvol_metric_rb_entry *entry) 78 + { 79 + struct rb_node **link = &tracker->tree.rb_root.rb_node; 80 + struct rb_node *parent = NULL; 81 + bool leftmost = true; 82 + 83 + while (*link) { 84 + struct ceph_subvol_metric_rb_entry *cur = 85 + rb_entry(*link, struct ceph_subvol_metric_rb_entry, node); 86 + 87 + parent = *link; 88 + if (entry->subvolume_id < cur->subvolume_id) 89 + link = &(*link)->rb_left; 90 + else if (entry->subvolume_id > cur->subvolume_id) { 91 + link = &(*link)->rb_right; 92 + leftmost = false; 93 + } else 94 + return cur; 95 + } 96 + 97 + rb_link_node(&entry->node, parent, link); 98 + rb_insert_color_cached(&entry->node, &tracker->tree, leftmost); 99 + tracker->nr_entries++; 100 + return entry; 101 + } 102 + 103 + static void ceph_subvolume_metrics_clear_locked( 104 + struct ceph_subvolume_metrics_tracker *tracker) 105 + { 106 + struct rb_node *node = rb_first_cached(&tracker->tree); 107 + 108 + while (node) { 109 + struct ceph_subvol_metric_rb_entry *entry = 110 + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); 111 + struct rb_node *next = rb_next(node); 112 + 113 + rb_erase_cached(&entry->node, &tracker->tree); 114 + tracker->nr_entries--; 115 + kmem_cache_free(ceph_subvol_metric_entry_cachep, entry); 116 + node = next; 117 + } 118 + 119 + tracker->tree = RB_ROOT_CACHED; 120 + } 121 + 122 + void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker) 123 + { 124 + spin_lock(&tracker->lock); 125 + ceph_subvolume_metrics_clear_locked(tracker); 126 + tracker->enabled = false; 127 + spin_unlock(&tracker->lock); 128 + } 129 + 130 + void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker, 131 + bool enable) 132 + { 133 + spin_lock(&tracker->lock); 134 + if (enable) { 135 + tracker->enabled = true; 136 + } else { 137 + tracker->enabled = false; 138 + ceph_subvolume_metrics_clear_locked(tracker); 139 + } 140 + spin_unlock(&tracker->lock); 141 + } 142 + 143 + void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker, 144 + u64 subvol_id, bool is_write, 145 + size_t size, u64 latency_us) 146 + { 147 + struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL; 148 + bool retry = false; 149 + 150 + /* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */ 151 + if (!READ_ONCE(tracker->enabled) || 152 + subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us) 153 + return; 154 + 155 + /* 156 + * Retry loop for lock-free allocation pattern: 157 + * 1. First iteration: lookup under lock, if miss -> drop lock, alloc, retry 158 + * 2. Second iteration: lookup again (may have been inserted), insert if still missing 159 + * 3. On race (another thread inserted same key): free our alloc, retry 160 + * All successful paths exit via return, so retry flag doesn't need reset. 161 + */ 162 + do { 163 + spin_lock(&tracker->lock); 164 + if (!tracker->enabled) { 165 + spin_unlock(&tracker->lock); 166 + if (new_entry) 167 + kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry); 168 + return; 169 + } 170 + 171 + entry = __lookup_entry(tracker, subvol_id); 172 + if (!entry) { 173 + if (!new_entry) { 174 + spin_unlock(&tracker->lock); 175 + new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep, 176 + GFP_NOFS); 177 + if (!new_entry) 178 + return; 179 + new_entry->subvolume_id = subvol_id; 180 + retry = true; 181 + continue; 182 + } 183 + entry = __insert_entry(tracker, new_entry); 184 + if (entry != new_entry) { 185 + /* raced with another insert */ 186 + spin_unlock(&tracker->lock); 187 + kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry); 188 + new_entry = NULL; 189 + retry = true; 190 + continue; 191 + } 192 + new_entry = NULL; 193 + } 194 + 195 + if (is_write) { 196 + entry->write_ops++; 197 + entry->write_bytes += size; 198 + entry->write_latency_us += latency_us; 199 + atomic64_inc(&tracker->total_write_ops); 200 + atomic64_add(size, &tracker->total_write_bytes); 201 + } else { 202 + entry->read_ops++; 203 + entry->read_bytes += size; 204 + entry->read_latency_us += latency_us; 205 + atomic64_inc(&tracker->total_read_ops); 206 + atomic64_add(size, &tracker->total_read_bytes); 207 + } 208 + spin_unlock(&tracker->lock); 209 + if (new_entry) 210 + kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry); 211 + return; 212 + } while (retry); 213 + } 214 + 215 + int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker, 216 + struct ceph_subvol_metric_snapshot **out, 217 + u32 *nr, bool consume) 218 + { 219 + struct ceph_subvol_metric_snapshot *snap = NULL; 220 + struct rb_node *node; 221 + u32 count = 0, idx = 0; 222 + int ret = 0; 223 + 224 + *out = NULL; 225 + *nr = 0; 226 + 227 + if (!READ_ONCE(tracker->enabled)) 228 + return 0; 229 + 230 + atomic64_inc(&tracker->snapshot_attempts); 231 + 232 + spin_lock(&tracker->lock); 233 + for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) { 234 + struct ceph_subvol_metric_rb_entry *entry = 235 + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); 236 + 237 + /* Include entries with ANY I/O activity (read OR write) */ 238 + if (entry->read_ops || entry->write_ops) 239 + count++; 240 + } 241 + spin_unlock(&tracker->lock); 242 + 243 + if (!count) { 244 + atomic64_inc(&tracker->snapshot_empty); 245 + return 0; 246 + } 247 + 248 + snap = kcalloc(count, sizeof(*snap), GFP_NOFS); 249 + if (!snap) { 250 + atomic64_inc(&tracker->snapshot_failures); 251 + return -ENOMEM; 252 + } 253 + 254 + spin_lock(&tracker->lock); 255 + node = rb_first_cached(&tracker->tree); 256 + while (node) { 257 + struct ceph_subvol_metric_rb_entry *entry = 258 + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); 259 + struct rb_node *next = rb_next(node); 260 + 261 + /* Skip entries with NO I/O activity at all */ 262 + if (!entry->read_ops && !entry->write_ops) { 263 + rb_erase_cached(&entry->node, &tracker->tree); 264 + tracker->nr_entries--; 265 + kmem_cache_free(ceph_subvol_metric_entry_cachep, entry); 266 + node = next; 267 + continue; 268 + } 269 + 270 + if (idx >= count) { 271 + pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n", 272 + idx, count); 273 + break; 274 + } 275 + 276 + snap[idx].subvolume_id = entry->subvolume_id; 277 + snap[idx].read_ops = entry->read_ops; 278 + snap[idx].write_ops = entry->write_ops; 279 + snap[idx].read_bytes = entry->read_bytes; 280 + snap[idx].write_bytes = entry->write_bytes; 281 + snap[idx].read_latency_us = entry->read_latency_us; 282 + snap[idx].write_latency_us = entry->write_latency_us; 283 + idx++; 284 + 285 + if (consume) { 286 + entry->read_ops = 0; 287 + entry->write_ops = 0; 288 + entry->read_bytes = 0; 289 + entry->write_bytes = 0; 290 + entry->read_latency_us = 0; 291 + entry->write_latency_us = 0; 292 + rb_erase_cached(&entry->node, &tracker->tree); 293 + tracker->nr_entries--; 294 + kmem_cache_free(ceph_subvol_metric_entry_cachep, entry); 295 + } 296 + node = next; 297 + } 298 + spin_unlock(&tracker->lock); 299 + 300 + if (!idx) { 301 + kfree(snap); 302 + snap = NULL; 303 + ret = 0; 304 + } else { 305 + *nr = idx; 306 + *out = snap; 307 + } 308 + 309 + return ret; 310 + } 311 + 312 + void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot) 313 + { 314 + kfree(snapshot); 315 + } 316 + 317 + /* 318 + * Dump subvolume metrics to a seq_file for debugfs. 319 + * 320 + * Iterates the rb-tree directly under spinlock to avoid allocation. 321 + * The lock hold time is minimal since we're only doing seq_printf calls. 322 + */ 323 + void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker, 324 + struct seq_file *s) 325 + { 326 + struct rb_node *node; 327 + bool found = false; 328 + 329 + spin_lock(&tracker->lock); 330 + if (!tracker->enabled) { 331 + spin_unlock(&tracker->lock); 332 + seq_puts(s, "subvolume metrics disabled\n"); 333 + return; 334 + } 335 + 336 + for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) { 337 + struct ceph_subvol_metric_rb_entry *entry = 338 + rb_entry(node, struct ceph_subvol_metric_rb_entry, node); 339 + u64 avg_rd_lat, avg_wr_lat; 340 + 341 + if (!entry->read_ops && !entry->write_ops) 342 + continue; 343 + 344 + if (!found) { 345 + seq_puts(s, "subvol_id rd_ops rd_bytes rd_avg_lat_us wr_ops wr_bytes wr_avg_lat_us\n"); 346 + seq_puts(s, "------------------------------------------------------------------------------------------------\n"); 347 + found = true; 348 + } 349 + 350 + avg_rd_lat = entry->read_ops ? 351 + div64_u64(entry->read_latency_us, entry->read_ops) : 0; 352 + avg_wr_lat = entry->write_ops ? 353 + div64_u64(entry->write_latency_us, entry->write_ops) : 0; 354 + 355 + seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n", 356 + entry->subvolume_id, 357 + entry->read_ops, 358 + entry->read_bytes, 359 + avg_rd_lat, 360 + entry->write_ops, 361 + entry->write_bytes, 362 + avg_wr_lat); 363 + } 364 + spin_unlock(&tracker->lock); 365 + 366 + if (!found) 367 + seq_puts(s, "(no subvolume metrics collected)\n"); 368 + } 369 + 370 + void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc, 371 + struct ceph_inode_info *ci, 372 + bool is_write, size_t bytes, 373 + ktime_t start, ktime_t end) 374 + { 375 + struct ceph_subvolume_metrics_tracker *tracker; 376 + u64 subvol_id; 377 + s64 delta_us; 378 + 379 + if (!mdsc || !ci || !bytes) 380 + return; 381 + 382 + tracker = &mdsc->subvol_metrics; 383 + atomic64_inc(&tracker->record_calls); 384 + 385 + if (!ceph_subvolume_metrics_enabled(tracker)) { 386 + atomic64_inc(&tracker->record_disabled); 387 + return; 388 + } 389 + 390 + subvol_id = READ_ONCE(ci->i_subvolume_id); 391 + if (subvol_id == CEPH_SUBVOLUME_ID_NONE) { 392 + atomic64_inc(&tracker->record_no_subvol); 393 + return; 394 + } 395 + 396 + delta_us = ktime_to_us(ktime_sub(end, start)); 397 + if (delta_us <= 0) 398 + delta_us = 1; 399 + 400 + ceph_subvolume_metrics_record(tracker, subvol_id, is_write, 401 + bytes, (u64)delta_us); 402 + } 403 + 404 + int __init ceph_subvolume_metrics_cache_init(void) 405 + { 406 + ceph_subvol_metric_entry_cachep = KMEM_CACHE(ceph_subvol_metric_rb_entry, 407 + SLAB_RECLAIM_ACCOUNT); 408 + if (!ceph_subvol_metric_entry_cachep) 409 + return -ENOMEM; 410 + return 0; 411 + } 412 + 413 + void ceph_subvolume_metrics_cache_destroy(void) 414 + { 415 + kmem_cache_destroy(ceph_subvol_metric_entry_cachep); 416 + }
+97
fs/ceph/subvolume_metrics.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _FS_CEPH_SUBVOLUME_METRICS_H 3 + #define _FS_CEPH_SUBVOLUME_METRICS_H 4 + 5 + #include <linux/types.h> 6 + #include <linux/rbtree.h> 7 + #include <linux/spinlock.h> 8 + #include <linux/ktime.h> 9 + #include <linux/atomic.h> 10 + 11 + struct seq_file; 12 + struct ceph_mds_client; 13 + struct ceph_inode_info; 14 + 15 + /** 16 + * struct ceph_subvol_metric_snapshot - Point-in-time snapshot of subvolume metrics 17 + * @subvolume_id: Subvolume identifier (inode number of subvolume root) 18 + * @read_ops: Number of read operations since last snapshot 19 + * @write_ops: Number of write operations since last snapshot 20 + * @read_bytes: Total bytes read since last snapshot 21 + * @write_bytes: Total bytes written since last snapshot 22 + * @read_latency_us: Sum of read latencies in microseconds (for avg calculation) 23 + * @write_latency_us: Sum of write latencies in microseconds (for avg calculation) 24 + */ 25 + struct ceph_subvol_metric_snapshot { 26 + u64 subvolume_id; 27 + u64 read_ops; 28 + u64 write_ops; 29 + u64 read_bytes; 30 + u64 write_bytes; 31 + u64 read_latency_us; 32 + u64 write_latency_us; 33 + }; 34 + 35 + /** 36 + * struct ceph_subvolume_metrics_tracker - Tracks per-subvolume I/O metrics 37 + * @lock: Protects @tree and @nr_entries during concurrent access 38 + * @tree: Red-black tree of per-subvolume entries, keyed by subvolume_id 39 + * @nr_entries: Number of entries currently in @tree 40 + * @enabled: Whether collection is enabled (requires MDS feature support) 41 + * @snapshot_attempts: Debug counter: total ceph_subvolume_metrics_snapshot() calls 42 + * @snapshot_empty: Debug counter: snapshots that found no data to report 43 + * @snapshot_failures: Debug counter: snapshots that failed to allocate memory 44 + * @record_calls: Debug counter: total ceph_subvolume_metrics_record() calls 45 + * @record_disabled: Debug counter: record calls skipped because disabled 46 + * @record_no_subvol: Debug counter: record calls skipped (no subvolume_id) 47 + * @total_read_ops: Cumulative read ops across all snapshots (never reset) 48 + * @total_read_bytes: Cumulative bytes read across all snapshots (never reset) 49 + * @total_write_ops: Cumulative write ops across all snapshots (never reset) 50 + * @total_write_bytes: Cumulative bytes written across all snapshots (never reset) 51 + */ 52 + struct ceph_subvolume_metrics_tracker { 53 + spinlock_t lock; 54 + struct rb_root_cached tree; 55 + u32 nr_entries; 56 + bool enabled; 57 + atomic64_t snapshot_attempts; 58 + atomic64_t snapshot_empty; 59 + atomic64_t snapshot_failures; 60 + atomic64_t record_calls; 61 + atomic64_t record_disabled; 62 + atomic64_t record_no_subvol; 63 + atomic64_t total_read_ops; 64 + atomic64_t total_read_bytes; 65 + atomic64_t total_write_ops; 66 + atomic64_t total_write_bytes; 67 + }; 68 + 69 + void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker); 70 + void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker); 71 + void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker, 72 + bool enable); 73 + void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker, 74 + u64 subvol_id, bool is_write, 75 + size_t size, u64 latency_us); 76 + int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker, 77 + struct ceph_subvol_metric_snapshot **out, 78 + u32 *nr, bool consume); 79 + void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot); 80 + void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker, 81 + struct seq_file *s); 82 + 83 + void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc, 84 + struct ceph_inode_info *ci, 85 + bool is_write, size_t bytes, 86 + ktime_t start, ktime_t end); 87 + 88 + static inline bool ceph_subvolume_metrics_enabled( 89 + const struct ceph_subvolume_metrics_tracker *tracker) 90 + { 91 + return READ_ONCE(tracker->enabled); 92 + } 93 + 94 + int __init ceph_subvolume_metrics_cache_init(void); 95 + void ceph_subvolume_metrics_cache_destroy(void); 96 + 97 + #endif /* _FS_CEPH_SUBVOLUME_METRICS_H */
+8
fs/ceph/super.c
··· 21 21 #include "mds_client.h" 22 22 #include "cache.h" 23 23 #include "crypto.h" 24 + #include "subvolume_metrics.h" 24 25 25 26 #include <linux/ceph/ceph_features.h> 26 27 #include <linux/ceph/decode.h> ··· 967 966 if (!ceph_wb_pagevec_pool) 968 967 goto bad_pagevec_pool; 969 968 969 + error = ceph_subvolume_metrics_cache_init(); 970 + if (error) 971 + goto bad_subvol_metrics; 972 + 970 973 return 0; 971 974 975 + bad_subvol_metrics: 976 + mempool_destroy(ceph_wb_pagevec_pool); 972 977 bad_pagevec_pool: 973 978 kmem_cache_destroy(ceph_mds_request_cachep); 974 979 bad_mds_req: ··· 1011 1004 kmem_cache_destroy(ceph_dir_file_cachep); 1012 1005 kmem_cache_destroy(ceph_mds_request_cachep); 1013 1006 mempool_destroy(ceph_wb_pagevec_pool); 1007 + ceph_subvolume_metrics_cache_destroy(); 1014 1008 } 1015 1009 1016 1010 static void __ceph_umount_begin(struct ceph_fs_client *fsc)
+11
fs/ceph/super.h
··· 179 179 struct dentry *debugfs_status; 180 180 struct dentry *debugfs_mds_sessions; 181 181 struct dentry *debugfs_metrics_dir; 182 + struct dentry *debugfs_subvolume_metrics; 182 183 #endif 183 184 184 185 #ifdef CONFIG_CEPH_FSCACHE ··· 398 397 399 398 /* quotas */ 400 399 u64 i_max_bytes, i_max_files; 400 + 401 + /* 402 + * Subvolume ID this inode belongs to. CEPH_SUBVOLUME_ID_NONE (0) 403 + * means unknown/unset, matching the FUSE client convention. 404 + * Once set to a valid (non-zero) value, it should not change 405 + * during the inode's lifetime. 406 + */ 407 + #define CEPH_SUBVOLUME_ID_NONE 0 408 + u64 i_subvolume_id; 401 409 402 410 s32 i_dir_pin; 403 411 ··· 1079 1069 extern struct inode *ceph_get_snapdir(struct inode *parent); 1080 1070 extern int ceph_fill_file_size(struct inode *inode, int issued, 1081 1071 u32 truncate_seq, u64 truncate_size, u64 size); 1072 + extern void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id); 1082 1073 extern void ceph_fill_file_time(struct inode *inode, int issued, 1083 1074 u64 time_warp_seq, struct timespec64 *ctime, 1084 1075 struct timespec64 *mtime,
+2 -2
net/ceph/auth.c
··· 245 245 ac->protocol = 0; 246 246 ac->ops = NULL; 247 247 } 248 - if (ac->protocol != protocol) { 248 + if (!ac->protocol) { 249 249 ret = init_protocol(ac, protocol); 250 250 if (ret) { 251 251 pr_err("auth protocol '%s' init failed: %d\n", ··· 257 257 ac->negotiating = false; 258 258 } 259 259 260 - if (result) { 260 + if (result < 0) { 261 261 pr_err("auth protocol '%s' mauth authentication failed: %d\n", 262 262 ceph_auth_proto_name(ac->protocol), result); 263 263 ret = result;
+3 -4
net/ceph/crush/mapper.c
··· 911 911 int osize; 912 912 const struct crush_rule *rule; 913 913 __u32 step; 914 - int i, j; 914 + int i; 915 915 int numrep; 916 916 int out_size; 917 917 /* ··· 1012 1012 if (numrep <= 0) 1013 1013 continue; 1014 1014 } 1015 - j = 0; 1016 1015 /* make sure bucket id is valid */ 1017 1016 bno = -1 - w[i]; 1018 1017 if (bno < 0 || bno >= map->max_buckets) { ··· 1035 1036 weight, weight_max, 1036 1037 x, numrep, 1037 1038 curstep->arg2, 1038 - o+osize, j, 1039 + o+osize, 0, 1039 1040 result_max-osize, 1040 1041 choose_tries, 1041 1042 recurse_tries, ··· 1057 1058 weight, weight_max, 1058 1059 x, out_size, numrep, 1059 1060 curstep->arg2, 1060 - o+osize, j, 1061 + o+osize, 0, 1061 1062 choose_tries, 1062 1063 choose_leaf_tries ? 1063 1064 choose_leaf_tries : 1,
+2 -2
net/ceph/messenger.c
··· 368 368 /* only queue to workqueue if there is data we want to write, 369 369 * and there is sufficient space in the socket buffer to accept 370 370 * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() 371 - * doesn't get called again until try_write() fills the socket 372 - * buffer. See net/ipv4/tcp_input.c:tcp_check_space() 371 + * doesn't get called again until ceph_con_v[12]_try_write() fills 372 + * the socket buffer. See net/ipv4/tcp_input.c:tcp_check_space() 373 373 * and net/core/stream.c:sk_stream_write_space(). 374 374 */ 375 375 if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
+5 -8
net/ceph/messenger_v2.c
··· 8 8 #include <linux/ceph/ceph_debug.h> 9 9 10 10 #include <crypto/aead.h> 11 - #include <crypto/hash.h> 12 11 #include <crypto/sha2.h> 13 12 #include <crypto/utils.h> 14 13 #include <linux/bvec.h> ··· 2351 2352 } 2352 2353 2353 2354 /* 2354 - * Align session_key and con_secret to avoid GFP_ATOMIC allocation 2355 - * inside crypto_shash_setkey() and crypto_aead_setkey() called from 2356 - * setup_crypto(). __aligned(16) isn't guaranteed to work for stack 2357 - * objects, so do it by hand. 2355 + * Align con_secret to avoid GFP_ATOMIC allocation inside 2356 + * crypto_aead_setkey() called from setup_crypto(). __aligned(16) 2357 + * isn't guaranteed to work for stack objects, so do it by hand. 2358 2358 */ 2359 2359 static int process_auth_done(struct ceph_connection *con, void *p, void *end) 2360 2360 { 2361 - u8 session_key_buf[CEPH_MAX_KEY_LEN + 16]; 2361 + u8 session_key[CEPH_MAX_KEY_LEN]; 2362 2362 u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; 2363 - u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); 2364 2363 u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); 2365 2364 int session_key_len, con_secret_len; 2366 2365 int payload_len; ··· 2412 2415 con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; 2413 2416 2414 2417 out: 2415 - memzero_explicit(session_key_buf, sizeof(session_key_buf)); 2418 + memzero_explicit(session_key, sizeof(session_key)); 2416 2419 memzero_explicit(con_secret_buf, sizeof(con_secret_buf)); 2417 2420 return ret; 2418 2421
+2
net/ceph/mon_client.c
··· 174 174 */ 175 175 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) 176 176 { 177 + BUG_ON(len > monc->m_auth->front_alloc_len); 178 + 177 179 monc->pending_auth = 1; 178 180 monc->m_auth->front.iov_len = len; 179 181 monc->m_auth->hdr.front_len = cpu_to_le32(len);