NFSD: Enforce timeout on layout recall and integrate lease manager fencing

+30

Documentation/admin-guide/nfs/pnfs-block-server.rst

··· 40 40 41 41 echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log 42 42 EOF 43 + 44 + If the nfsd server needs to fence a non-responding client and the 45 + fencing operation fails, the server logs a warning message in the 46 + system log with the following format: 47 + 48 + FENCE failed client[IP_address] clid[#n] device[dev_name] 49 + 50 + Where: 51 + 52 + IP_address: refers to the IP address of the affected client. 53 + #n: indicates the unique client identifier. 54 + dev_name: specifies the name of the block device related 55 + to the fencing attempt. 56 + 57 + The server will repeatedly retry the operation indefinitely. During 58 + this time, access to the affected file is restricted for all other 59 + clients. This is to prevent potential data corruption if multiple 60 + clients access the same file simultaneously. 61 + 62 + To restore access to the affected file for other clients, the admin 63 + needs to take the following actions: 64 + 65 + . shut down or power off the client being fenced. 66 + . manually expire the client to release all its state on the server: 67 + 68 + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl 69 + 70 + Where: 71 + 72 + clid: is the unique client identifier displayed in the system log.

+31

Documentation/admin-guide/nfs/pnfs-scsi-server.rst

··· 22 22 On the client make sure the kernel has the CONFIG_PNFS_BLOCK option 23 23 enabled, and the file system is mounted using the NFSv4.1 protocol 24 24 version (mount -o vers=4.1). 25 + 26 + If the nfsd server needs to fence a non-responding client and the 27 + fencing operation fails, the server logs a warning message in the 28 + system log with the following format: 29 + 30 + FENCE failed client[IP_address] clid[#n] device[dev_name] 31 + 32 + Where: 33 + 34 + IP_address: refers to the IP address of the affected client. 35 + #n: indicates the unique client identifier. 36 + dev_name: specifies the name of the block device related 37 + to the fencing attempt. 38 + 39 + The server will repeatedly retry the operation indefinitely. During 40 + this time, access to the affected file is restricted for all other 41 + clients. This is to prevent potential data corruption if multiple 42 + clients access the same file simultaneously. 43 + 44 + To restore access to the affected file for other clients, the admin 45 + needs to take the following actions: 46 + 47 + . shut down or power off the client being fenced. 48 + . manually expire the client to release all its state on the server: 49 + 50 + echo 'expire' > /proc/fs/nfsd/clients/clid/ctl 51 + 52 + Where: 53 + 54 + clid: is the unique client identifier displayed in the system log. 55 +

+2

Documentation/filesystems/locking.rst

··· 398 398 bool (*lm_breaker_owns_lease)(struct file_lock *); 399 399 bool (*lm_lock_expirable)(struct file_lock *); 400 400 void (*lm_expire_lock)(void); 401 + bool (*lm_breaker_timedout)(struct file_lease *); 401 402 402 403 locking rules: 403 404 ··· 413 412 lm_lock_expirable yes no no 414 413 lm_expire_lock no no yes 415 414 lm_open_conflict yes no no 415 + lm_breaker_timedout yes no no 416 416 ====================== ============= ================= ========= 417 417 418 418 buffer_head

+21 -5

fs/locks.c

··· 1534 1534 { 1535 1535 struct file_lock_context *ctx = inode->i_flctx; 1536 1536 struct file_lease *fl, *tmp; 1537 + bool remove; 1537 1538 1538 1539 lockdep_assert_held(&ctx->flc_lock); 1539 1540 ··· 1542 1541 trace_time_out_leases(inode, fl); 1543 1542 if (past_time(fl->fl_downgrade_time)) 1544 1543 lease_modify(fl, F_RDLCK, dispose); 1545 - if (past_time(fl->fl_break_time)) 1546 - lease_modify(fl, F_UNLCK, dispose); 1544 + 1545 + remove = true; 1546 + if (past_time(fl->fl_break_time)) { 1547 + /* 1548 + * Consult the lease manager when a lease break times 1549 + * out to determine whether the lease should be disposed 1550 + * of. 1551 + */ 1552 + if (fl->fl_lmops && fl->fl_lmops->lm_breaker_timedout) 1553 + remove = fl->fl_lmops->lm_breaker_timedout(fl); 1554 + if (remove) 1555 + lease_modify(fl, F_UNLCK, dispose); 1556 + } 1547 1557 } 1548 1558 } 1549 1559 ··· 1682 1670 restart: 1683 1671 fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list); 1684 1672 break_time = fl->fl_break_time; 1685 - if (break_time != 0) 1686 - break_time -= jiffies; 1687 - if (break_time == 0) 1673 + if (break_time != 0) { 1674 + if (time_after(jiffies, break_time)) { 1675 + fl->fl_break_time = jiffies + lease_break_time * HZ; 1676 + break_time = lease_break_time * HZ; 1677 + } else 1678 + break_time -= jiffies; 1679 + } else 1688 1680 break_time++; 1689 1681 locks_insert_block(&fl->c, &new_fl->c, leases_conflict); 1690 1682 trace_break_lease_block(inode, new_fl);

+35 -7

fs/nfsd/blocklayout.c

··· 297 297 ret = 0; 298 298 } 299 299 xa_unlock(xa); 300 + clp->cl_fence_retry_warn = false; 300 301 return ret; 301 302 } 302 303 ··· 444 443 return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); 445 444 } 446 445 447 - static void 446 + /* 447 + * Perform the fence operation to prevent the client from accessing the 448 + * block device. If a fence operation is already in progress, wait for 449 + * it to complete before checking the NFSD_MDS_PR_FENCED flag. Once the 450 + * operation is complete, check the flag. If NFSD_MDS_PR_FENCED is set, 451 + * update the layout stateid by setting the ls_fenced flag to indicate 452 + * that the client has been fenced. 453 + * 454 + * The cl_fence_mutex ensures that the fence operation has been fully 455 + * completed, rather than just in progress, when returning from this 456 + * function. 457 + * 458 + * Return true if client was fenced otherwise return false. 459 + */ 460 + static bool 448 461 nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) 449 462 { 450 463 struct nfs4_client *clp = ls->ls_stid.sc_client; 451 464 struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; 452 465 int status; 466 + bool ret; 453 467 454 - if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) 455 - return; 468 + mutex_lock(&clp->cl_fence_mutex); 469 + if (nfsd4_scsi_fence_set(clp, bdev->bd_dev)) { 470 + mutex_unlock(&clp->cl_fence_mutex); 471 + return true; 472 + } 456 473 457 474 status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, 458 475 nfsd4_scsi_pr_key(clp), ··· 489 470 * PR_STS_RESERVATION_CONFLICT, which would cause an infinite 490 471 * retry loop. 
491 472 */ 492 - if (status < 0 || 493 - status == PR_STS_PATH_FAILED || 494 - status == PR_STS_PATH_FAST_FAILED || 495 - status == PR_STS_RETRY_PATH_FAILURE) 473 + switch (status) { 474 + case 0: 475 + case PR_STS_IOERR: 476 + case PR_STS_RESERVATION_CONFLICT: 477 + ret = true; 478 + break; 479 + default: 480 + /* retry-able and other errors */ 481 + ret = false; 496 482 nfsd4_scsi_fence_clear(clp, bdev->bd_dev); 483 + break; 484 + } 485 + mutex_unlock(&clp->cl_fence_mutex); 497 486 498 487 trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); 488 + return ret; 499 489 } 500 490 501 491 const struct nfsd4_layout_ops scsi_layout_ops = {

+148 -4

fs/nfsd/nfs4layouts.c

··· 27 27 static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; 28 28 static const struct lease_manager_operations nfsd4_layouts_lm_ops; 29 29 30 + static void nfsd4_layout_fence_worker(struct work_struct *work); 31 + 30 32 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { 31 33 #ifdef CONFIG_NFSD_FLEXFILELAYOUT 32 34 [LAYOUT_FLEX_FILES] = &ff_layout_ops, ··· 179 177 180 178 trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); 181 179 180 + spin_lock(&ls->ls_lock); 181 + if (delayed_work_pending(&ls->ls_fence_work)) { 182 + spin_unlock(&ls->ls_lock); 183 + cancel_delayed_work_sync(&ls->ls_fence_work); 184 + } else 185 + spin_unlock(&ls->ls_lock); 186 + 182 187 spin_lock(&clp->cl_lock); 183 188 list_del_init(&ls->ls_perclnt); 184 189 spin_unlock(&clp->cl_lock); ··· 279 270 spin_lock(&fp->fi_lock); 280 271 list_add(&ls->ls_perfile, &fp->fi_lo_states); 281 272 spin_unlock(&fp->fi_lock); 273 + 274 + ls->ls_fenced = false; 275 + ls->ls_fence_delay = 0; 276 + INIT_DELAYED_WORK(&ls->ls_fence_work, nfsd4_layout_fence_worker); 282 277 283 278 trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); 284 279 return ls; ··· 760 747 nfsd4_layout_lm_break(struct file_lease *fl) 761 748 { 762 749 /* 763 - * We don't want the locks code to timeout the lease for us; 764 - * we'll remove it ourself if a layout isn't returned 765 - * in time: 750 + * Enforce break lease timeout to prevent NFSD 751 + * thread from hanging in __break_lease. 
766 752 */ 767 - fl->fl_break_time = 0; 768 753 nfsd4_recall_file_layout(fl->c.flc_owner); 769 754 return false; 770 755 } ··· 793 782 return 0; 794 783 } 795 784 785 + static void 786 + nfsd4_layout_fence_worker(struct work_struct *work) 787 + { 788 + struct delayed_work *dwork = to_delayed_work(work); 789 + struct nfs4_layout_stateid *ls = container_of(dwork, 790 + struct nfs4_layout_stateid, ls_fence_work); 791 + struct nfsd_file *nf; 792 + struct block_device *bdev; 793 + struct nfs4_client *clp; 794 + struct nfsd_net *nn; 795 + 796 + /* 797 + * The workqueue clears WORK_STRUCT_PENDING before invoking 798 + * this callback. Re-arm immediately so that 799 + * delayed_work_pending() returns true while the fence 800 + * operation is in progress, preventing 801 + * lm_breaker_timedout() from taking a duplicate reference. 802 + */ 803 + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); 804 + 805 + spin_lock(&ls->ls_lock); 806 + if (list_empty(&ls->ls_layouts)) { 807 + spin_unlock(&ls->ls_lock); 808 + dispose: 809 + cancel_delayed_work(&ls->ls_fence_work); 810 + /* unlock the lease so that tasks waiting on it can proceed */ 811 + nfsd4_close_layout(ls); 812 + 813 + ls->ls_fenced = true; 814 + nfs4_put_stid(&ls->ls_stid); 815 + return; 816 + } 817 + spin_unlock(&ls->ls_lock); 818 + 819 + rcu_read_lock(); 820 + nf = nfsd_file_get(ls->ls_file); 821 + rcu_read_unlock(); 822 + if (!nf) 823 + goto dispose; 824 + 825 + clp = ls->ls_stid.sc_client; 826 + nn = net_generic(clp->net, nfsd_net_id); 827 + bdev = nf->nf_file->f_path.mnt->mnt_sb->s_bdev; 828 + if (nfsd4_layout_ops[ls->ls_layout_type]->fence_client(ls, nf)) { 829 + /* fenced ok */ 830 + nfsd_file_put(nf); 831 + pr_warn("%s: FENCED client[%pISpc] clid[%d] to device[%s]\n", 832 + __func__, (struct sockaddr *)&clp->cl_addr, 833 + clp->cl_clientid.cl_id - nn->clientid_base, 834 + bdev->bd_disk->disk_name); 835 + goto dispose; 836 + } 837 + /* fence failed */ 838 + nfsd_file_put(nf); 839 + 840 + if 
(!clp->cl_fence_retry_warn) { 841 + pr_warn("%s: FENCE failed client[%pISpc] clid[%d] device[%s]\n", 842 + __func__, (struct sockaddr *)&clp->cl_addr, 843 + clp->cl_clientid.cl_id - nn->clientid_base, 844 + bdev->bd_disk->disk_name); 845 + clp->cl_fence_retry_warn = true; 846 + } 847 + /* 848 + * The fence worker retries the fencing operation indefinitely to 849 + * prevent data corruption. The admin needs to take the following 850 + * actions to restore access to the file for other clients: 851 + * 852 + * . shutdown or power off the client being fenced. 853 + * . manually expire the client to release all its state on the server; 854 + * echo 'expire' > /proc/fs/nfsd/clients/clid/ctl'. 855 + * 856 + * Where: 857 + * 858 + * clid: is the unique client identifier displayed in 859 + * the warning message above. 860 + */ 861 + if (!ls->ls_fence_delay) 862 + ls->ls_fence_delay = HZ; 863 + else 864 + ls->ls_fence_delay = min(ls->ls_fence_delay << 1, 865 + MAX_FENCE_DELAY); 866 + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, ls->ls_fence_delay); 867 + } 868 + 869 + /** 870 + * nfsd4_layout_lm_breaker_timedout - The layout recall has timed out. 871 + * @fl: file to check 872 + * 873 + * If the layout type supports a fence operation, schedule a worker to 874 + * fence the client from accessing the block device. 875 + * 876 + * This function runs under the protection of the spin_lock flc_lock. 877 + * At this time, the file_lease associated with the layout stateid is 878 + * on the flc_list. A reference count is incremented on the layout 879 + * stateid to prevent it from being freed while the fence worker is 880 + * executing. Once the fence worker finishes its operation, it releases 881 + * this reference. 882 + * 883 + * The fence worker continues to run until either the client has been 884 + * fenced or the layout becomes invalid. The layout can become invalid 885 + * as a result of a LAYOUTRETURN or when the CB_LAYOUT recall callback 886 + * has completed. 
887 + * 888 + * Return true if the file_lease should be disposed of by the caller; 889 + * otherwise, return false. 890 + */ 891 + static bool 892 + nfsd4_layout_lm_breaker_timedout(struct file_lease *fl) 893 + { 894 + struct nfs4_layout_stateid *ls = fl->c.flc_owner; 895 + 896 + if ((!nfsd4_layout_ops[ls->ls_layout_type]->fence_client) || 897 + ls->ls_fenced) 898 + return true; 899 + if (delayed_work_pending(&ls->ls_fence_work)) 900 + return false; 901 + /* 902 + * Make sure layout has not been returned yet before 903 + * taking a reference count on the layout stateid. 904 + */ 905 + spin_lock(&ls->ls_lock); 906 + if (list_empty(&ls->ls_layouts) || 907 + !refcount_inc_not_zero(&ls->ls_stid.sc_count)) { 908 + spin_unlock(&ls->ls_lock); 909 + return true; 910 + } 911 + spin_unlock(&ls->ls_lock); 912 + 913 + mod_delayed_work(system_dfl_wq, &ls->ls_fence_work, 0); 914 + return false; 915 + } 916 + 796 917 static const struct lease_manager_operations nfsd4_layouts_lm_ops = { 797 918 .lm_break = nfsd4_layout_lm_break, 798 919 .lm_change = nfsd4_layout_lm_change, 799 920 .lm_open_conflict = nfsd4_layout_lm_open_conflict, 921 + .lm_breaker_timedout = nfsd4_layout_lm_breaker_timedout, 800 922 }; 801 923 802 924 int

+1

fs/nfsd/nfs4state.c

··· 2386 2386 #endif 2387 2387 #ifdef CONFIG_NFSD_SCSILAYOUT 2388 2388 xa_init(&clp->cl_dev_fences); 2389 + mutex_init(&clp->cl_fence_mutex); 2389 2390 #endif 2390 2391 INIT_LIST_HEAD(&clp->async_copies); 2391 2392 spin_lock_init(&clp->async_lock);

+4 -1

fs/nfsd/pnfs.h

··· 11 11 12 12 struct xdr_stream; 13 13 14 + /* Cap exponential backoff between fence retries at 3 minutes */ 15 + #define MAX_FENCE_DELAY ((unsigned int)(3 * 60 * HZ)) 16 + 14 17 struct nfsd4_deviceid_map { 15 18 struct list_head hash; 16 19 u64 idx; ··· 41 38 struct svc_rqst *rqstp, 42 39 struct nfsd4_layoutcommit *lcp); 43 40 44 - void (*fence_client)(struct nfs4_layout_stateid *ls, 41 + bool (*fence_client)(struct nfs4_layout_stateid *ls, 45 42 struct nfsd_file *file); 46 43 }; 47 44

+6

fs/nfsd/state.h

··· 456 456 struct list_head cl_lru; /* tail queue */ 457 457 #ifdef CONFIG_NFSD_PNFS 458 458 struct list_head cl_lo_states; /* outstanding layout states */ 459 + bool cl_fence_retry_warn; 459 460 #endif 460 461 struct xdr_netobj cl_name; /* id generated by client */ 461 462 nfs4_verifier cl_verifier; /* generated by client */ ··· 530 529 time64_t cl_ra_time; 531 530 #ifdef CONFIG_NFSD_SCSILAYOUT 532 531 struct xarray cl_dev_fences; 532 + struct mutex cl_fence_mutex; 533 533 #endif 534 534 }; 535 535 ··· 747 745 stateid_t ls_recall_sid; 748 746 bool ls_recalled; 749 747 struct mutex ls_mutex; 748 + 749 + struct delayed_work ls_fence_work; 750 + unsigned int ls_fence_delay; 751 + bool ls_fenced; 750 752 }; 751 753 752 754 static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)

+1

include/linux/filelock.h

··· 50 50 void (*lm_setup)(struct file_lease *, void **); 51 51 bool (*lm_breaker_owns_lease)(struct file_lease *); 52 52 int (*lm_open_conflict)(struct file *, int); 53 + bool (*lm_breaker_timedout)(struct file_lease *fl); 53 54 }; 54 55 55 56 struct lock_manager {

Configure Feed

Configure Feed