Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ceph: supply snapshot context in ceph_uninline_data()

The ceph_uninline_data function was missing proper snapshot context
handling for its OSD write operations. Both CEPH_OSD_OP_CREATE and
CEPH_OSD_OP_WRITE requests were passing NULL instead of the appropriate
snapshot context, which could lead to unnecessary object clones.

Reproducer:
../src/vstart.sh --new -x --localhost --bluestore
// turn on cephfs inline data
./bin/ceph fs set a inline_data true --yes-i-really-really-mean-it
// allow fs_a client to take snapshot
./bin/ceph auth caps client.fs_a mds 'allow rwps fsname=a' mon 'allow r fsname=a' osd 'allow rw tag cephfs data=a'
// mount cephfs with fuse, since kernel cephfs doesn't support inline write
ceph-fuse --id fs_a -m 127.0.0.1:40318 --conf ceph.conf -d /mnt/mycephfs/
// bump snapshot seq
mkdir /mnt/mycephfs/.snap/snap1
echo "foo" > /mnt/mycephfs/test
// umount and mount it again using kernel cephfs client
umount /mnt/mycephfs
mount -t ceph fs_a@.a=/ /mnt/mycephfs/ -o conf=./ceph.conf
echo "bar" >> /mnt/mycephfs/test
./bin/rados listsnaps -p cephfs.a.data $(printf "%x\n" $(stat -c %i /mnt/mycephfs/test)).00000000

The output shows that the object was cloned unnecessarily:
1000000000a.00000000 (seq:2):
cloneid snaps size overlap
2 2 4 []
head - 8

but it's expected to see
10000000000.00000000 (seq:2):
cloneid snaps size overlap
head - 8

since no snapshot was taken between these two writes.

The clone happens because the first OSD request, CEPH_OSD_OP_CREATE,
doesn't pass a snapshot context, so the object is created with snap seq 0,
whereas the later data writeback is issued with a snapshot context.
Since snap.seq(1) > object snap seq(0), the OSD clones the object.

This fix properly acquires the snapshot context before performing
write operations.

Signed-off-by: ethanwu <ethanwu@synology.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Tested-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by

ethanwu and committed by
Ilya Dryomov
305ff6b3 f16bd3fa

+22 -2
+22 -2
fs/ceph/addr.c
··· 2199 2199 struct ceph_osd_request *req = NULL; 2200 2200 struct ceph_cap_flush *prealloc_cf = NULL; 2201 2201 struct folio *folio = NULL; 2202 + struct ceph_snap_context *snapc = NULL; 2202 2203 u64 inline_version = CEPH_INLINE_NONE; 2203 2204 struct page *pages[1]; 2204 2205 int err = 0; ··· 2227 2226 if (inline_version == 1) /* initial version, no data */ 2228 2227 goto out_uninline; 2229 2228 2229 + down_read(&fsc->mdsc->snap_rwsem); 2230 + spin_lock(&ci->i_ceph_lock); 2231 + if (__ceph_have_pending_cap_snap(ci)) { 2232 + struct ceph_cap_snap *capsnap = 2233 + list_last_entry(&ci->i_cap_snaps, 2234 + struct ceph_cap_snap, 2235 + ci_item); 2236 + snapc = ceph_get_snap_context(capsnap->context); 2237 + } else { 2238 + if (!ci->i_head_snapc) { 2239 + ci->i_head_snapc = ceph_get_snap_context( 2240 + ci->i_snap_realm->cached_context); 2241 + } 2242 + snapc = ceph_get_snap_context(ci->i_head_snapc); 2243 + } 2244 + spin_unlock(&ci->i_ceph_lock); 2245 + up_read(&fsc->mdsc->snap_rwsem); 2246 + 2230 2247 folio = read_mapping_folio(inode->i_mapping, 0, file); 2231 2248 if (IS_ERR(folio)) { 2232 2249 err = PTR_ERR(folio); ··· 2260 2241 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 2261 2242 ceph_vino(inode), 0, &len, 0, 1, 2262 2243 CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, 2263 - NULL, 0, 0, false); 2244 + snapc, 0, 0, false); 2264 2245 if (IS_ERR(req)) { 2265 2246 err = PTR_ERR(req); 2266 2247 goto out_unlock; ··· 2276 2257 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 2277 2258 ceph_vino(inode), 0, &len, 1, 3, 2278 2259 CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, 2279 - NULL, ci->i_truncate_seq, 2260 + snapc, ci->i_truncate_seq, 2280 2261 ci->i_truncate_size, false); 2281 2262 if (IS_ERR(req)) { 2282 2263 err = PTR_ERR(req); ··· 2339 2320 folio_put(folio); 2340 2321 } 2341 2322 out: 2323 + ceph_put_snap_context(snapc); 2342 2324 ceph_free_cap_flush(prealloc_cf); 2343 2325 doutc(cl, "%llx.%llx inline_version %llu = %d\n", 2344 2326 
ceph_vinop(inode), inline_version, err);