Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

fanotify: configurable limits via sysfs

fanotify has some hardcoded limits. The only APIs to escape those limits
are FAN_UNLIMITED_QUEUE and FAN_UNLIMITED_MARKS.

Allow finer-grained tuning of the system limits via sysctl tunables under
/proc/sys/fs/fanotify, similar to the tunables under /proc/sys/fs/inotify,
with some minor differences.

- max_queued_events - global system tunable for group queue size limit.
Like the inotify tunable with the same name, it defaults to 16384 and
applies on initialization of a new group.

- max_user_marks - user ns tunable for marks limit per user.
Like the inotify tunable named max_user_watches, on a machine with
sufficient RAM it defaults to 1048576 in init userns, and it can be
further limited per containing user ns.

- max_user_groups - user ns tunable for number of groups per user.
Like the inotify tunable named max_user_instances, it defaults to 128
in init userns and can be further limited per containing user ns.

The slightly different tunable names used for fanotify are derived from
the "group" and "mark" terminology used in the fanotify man pages and
throughout the code.

Considering the fact that the default value for the inotify tunable
max_user_watches was increased in kernel v5.10 from 8192 to 1048576,
leaving the legacy fanotify limit of 8192 marks per group in addition to
the max_user_marks limit makes little sense, so the per group marks limit
has been removed.

Note that when a group is initialized with FAN_UNLIMITED_MARKS, its own
marks are not accounted in the per user marks account, so in effect the
limit of max_user_marks is only for the collection of groups that are
not initialized with FAN_UNLIMITED_MARKS.

Link: https://lore.kernel.org/r/20210304112921.3996419-2-amir73il@gmail.com
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>

authored by

Amir Goldstein and committed by
Jan Kara
5b8fea65 b8cd0ee8

+137 -39
+11 -5
fs/notify/fanotify/fanotify.c
··· 801 801 802 802 static void fanotify_free_group_priv(struct fsnotify_group *group) 803 803 { 804 - struct user_struct *user; 805 - 806 804 kfree(group->fanotify_data.merge_hash); 807 - user = group->fanotify_data.user; 808 - atomic_dec(&user->fanotify_listeners); 809 - free_uid(user); 805 + if (group->fanotify_data.ucounts) 806 + dec_ucount(group->fanotify_data.ucounts, 807 + UCOUNT_FANOTIFY_GROUPS); 810 808 } 811 809 812 810 static void fanotify_free_path_event(struct fanotify_event *event) ··· 860 862 } 861 863 } 862 864 865 + static void fanotify_freeing_mark(struct fsnotify_mark *mark, 866 + struct fsnotify_group *group) 867 + { 868 + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) 869 + dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS); 870 + } 871 + 863 872 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) 864 873 { 865 874 kmem_cache_free(fanotify_mark_cache, fsn_mark); ··· 876 871 .handle_event = fanotify_handle_event, 877 872 .free_group_priv = fanotify_free_group_priv, 878 873 .free_event = fanotify_free_event, 874 + .freeing_mark = fanotify_freeing_mark, 879 875 .free_mark = fanotify_free_mark, 880 876 };
+103 -20
fs/notify/fanotify/fanotify_user.c
··· 27 27 #include "fanotify.h" 28 28 29 29 #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 30 - #define FANOTIFY_DEFAULT_MAX_MARKS 8192 31 - #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 30 + #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 31 + #define FANOTIFY_DEFAULT_MAX_GROUPS 128 32 + 33 + /* 34 + * Legacy fanotify marks limits (8192) is per group and we introduced a tunable 35 + * limit of marks per user, similar to inotify. Effectively, the legacy limit 36 + * of fanotify marks per user is <max marks per group> * <max groups per user>. 37 + * This default limit (1M) also happens to match the increased limit of inotify 38 + * max_user_watches since v5.10. 39 + */ 40 + #define FANOTIFY_DEFAULT_MAX_USER_MARKS \ 41 + (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) 42 + 43 + /* 44 + * Most of the memory cost of adding an inode mark is pinning the marked inode. 45 + * The size of the filesystem inode struct is not uniform across filesystems, 46 + * so double the size of a VFS inode is used as a conservative approximation. 
47 + */ 48 + #define INODE_MARK_COST (2 * sizeof(struct inode)) 49 + 50 + /* configurable via /proc/sys/fs/fanotify/ */ 51 + static int fanotify_max_queued_events __read_mostly; 52 + 53 + #ifdef CONFIG_SYSCTL 54 + 55 + #include <linux/sysctl.h> 56 + 57 + struct ctl_table fanotify_table[] = { 58 + { 59 + .procname = "max_user_groups", 60 + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], 61 + .maxlen = sizeof(int), 62 + .mode = 0644, 63 + .proc_handler = proc_dointvec_minmax, 64 + .extra1 = SYSCTL_ZERO, 65 + }, 66 + { 67 + .procname = "max_user_marks", 68 + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], 69 + .maxlen = sizeof(int), 70 + .mode = 0644, 71 + .proc_handler = proc_dointvec_minmax, 72 + .extra1 = SYSCTL_ZERO, 73 + }, 74 + { 75 + .procname = "max_queued_events", 76 + .data = &fanotify_max_queued_events, 77 + .maxlen = sizeof(int), 78 + .mode = 0644, 79 + .proc_handler = proc_dointvec_minmax, 80 + .extra1 = SYSCTL_ZERO 81 + }, 82 + { } 83 + }; 84 + #endif /* CONFIG_SYSCTL */ 32 85 33 86 /* 34 87 * All flags that may be specified in parameter event_f_flags of fanotify_init. ··· 900 847 unsigned int type, 901 848 __kernel_fsid_t *fsid) 902 849 { 850 + struct ucounts *ucounts = group->fanotify_data.ucounts; 903 851 struct fsnotify_mark *mark; 904 852 int ret; 905 853 906 - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 854 + /* 855 + * Enforce per user marks limits per user in all containing user ns. 856 + * A group with FAN_UNLIMITED_MARKS does not contribute to mark count 857 + * in the limited groups account. 
858 + */ 859 + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && 860 + !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) 907 861 return ERR_PTR(-ENOSPC); 908 862 909 863 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 910 - if (!mark) 911 - return ERR_PTR(-ENOMEM); 864 + if (!mark) { 865 + ret = -ENOMEM; 866 + goto out_dec_ucounts; 867 + } 912 868 913 869 fsnotify_init_mark(mark, group); 914 870 ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); 915 871 if (ret) { 916 872 fsnotify_put_mark(mark); 917 - return ERR_PTR(ret); 873 + goto out_dec_ucounts; 918 874 } 919 875 920 876 return mark; 877 + 878 + out_dec_ucounts: 879 + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) 880 + dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); 881 + return ERR_PTR(ret); 921 882 } 922 883 923 884 ··· 1030 963 { 1031 964 struct fsnotify_group *group; 1032 965 int f_flags, fd; 1033 - struct user_struct *user; 1034 966 unsigned int fid_mode = flags & FANOTIFY_FID_BITS; 1035 967 unsigned int class = flags & FANOTIFY_CLASS_BITS; 1036 968 ··· 1068 1002 if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) 1069 1003 return -EINVAL; 1070 1004 1071 - user = get_current_user(); 1072 - if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { 1073 - free_uid(user); 1074 - return -EMFILE; 1075 - } 1076 - 1077 1005 f_flags = O_RDWR | FMODE_NONOTIFY; 1078 1006 if (flags & FAN_CLOEXEC) 1079 1007 f_flags |= O_CLOEXEC; ··· 1077 1017 /* fsnotify_alloc_group takes a ref. 
Dropped in fanotify_release */ 1078 1018 group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); 1079 1019 if (IS_ERR(group)) { 1080 - free_uid(user); 1081 1020 return PTR_ERR(group); 1082 1021 } 1083 1022 1084 - group->fanotify_data.user = user; 1023 + /* Enforce groups limits per user in all containing user ns */ 1024 + group->fanotify_data.ucounts = inc_ucount(current_user_ns(), 1025 + current_euid(), 1026 + UCOUNT_FANOTIFY_GROUPS); 1027 + if (!group->fanotify_data.ucounts) { 1028 + fd = -EMFILE; 1029 + goto out_destroy_group; 1030 + } 1031 + 1085 1032 group->fanotify_data.flags = flags; 1086 - atomic_inc(&user->fanotify_listeners); 1087 1033 group->memcg = get_mem_cgroup_from_mm(current->mm); 1088 1034 1089 1035 group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); ··· 1130 1064 goto out_destroy_group; 1131 1065 group->max_events = UINT_MAX; 1132 1066 } else { 1133 - group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; 1067 + group->max_events = fanotify_max_queued_events; 1134 1068 } 1135 1069 1136 1070 if (flags & FAN_UNLIMITED_MARKS) { 1137 1071 fd = -EPERM; 1138 1072 if (!capable(CAP_SYS_ADMIN)) 1139 1073 goto out_destroy_group; 1140 - group->fanotify_data.max_marks = UINT_MAX; 1141 - } else { 1142 - group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; 1143 1074 } 1144 1075 1145 1076 if (flags & FAN_ENABLE_AUDIT) { ··· 1420 1357 */ 1421 1358 static int __init fanotify_user_setup(void) 1422 1359 { 1360 + struct sysinfo si; 1361 + int max_marks; 1362 + 1363 + si_meminfo(&si); 1364 + /* 1365 + * Allow up to 1% of addressable memory to be accounted for per user 1366 + * marks limited to the range [8192, 1048576]. mount and sb marks are 1367 + * a lot cheaper than inode marks, but there is no reason for a user 1368 + * to have many of those, so calculate by the cost of inode marks. 
1369 + */ 1370 + max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / 1371 + INODE_MARK_COST; 1372 + max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, 1373 + FANOTIFY_DEFAULT_MAX_USER_MARKS); 1374 + 1423 1375 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); 1424 1376 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); 1425 1377 ··· 1448 1370 fanotify_perm_event_cachep = 1449 1371 KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); 1450 1372 } 1373 + 1374 + fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; 1375 + init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = 1376 + FANOTIFY_DEFAULT_MAX_GROUPS; 1377 + init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; 1451 1378 1452 1379 return 0; 1453 1380 }
-1
fs/notify/group.c
··· 122 122 123 123 /* set to 0 when there a no external references to this group */ 124 124 refcount_set(&group->refcnt, 1); 125 - atomic_set(&group->num_marks, 0); 126 125 atomic_set(&group->user_waits, 0); 127 126 128 127 spin_lock_init(&group->notification_lock);
-4
fs/notify/mark.c
··· 391 391 list_del_init(&mark->g_list); 392 392 spin_unlock(&mark->lock); 393 393 394 - atomic_dec(&group->num_marks); 395 - 396 394 /* Drop mark reference acquired in fsnotify_add_mark_locked() */ 397 395 fsnotify_put_mark(mark); 398 396 } ··· 654 656 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; 655 657 656 658 list_add(&mark->g_list, &group->marks_list); 657 - atomic_inc(&group->num_marks); 658 659 fsnotify_get_mark(mark); /* for g_list */ 659 660 spin_unlock(&mark->lock); 660 661 ··· 671 674 FSNOTIFY_MARK_FLAG_ATTACHED); 672 675 list_del_init(&mark->g_list); 673 676 spin_unlock(&mark->lock); 674 - atomic_dec(&group->num_marks); 675 677 676 678 fsnotify_put_mark(mark); 677 679 return ret;
+3
include/linux/fanotify.h
··· 2 2 #ifndef _LINUX_FANOTIFY_H 3 3 #define _LINUX_FANOTIFY_H 4 4 5 + #include <linux/sysctl.h> 5 6 #include <uapi/linux/fanotify.h> 7 + 8 + extern struct ctl_table fanotify_table[]; /* for sysctl */ 6 9 7 10 #define FAN_GROUP_FLAG(group, flag) \ 8 11 ((group)->fanotify_data.flags & (flag))
+1 -5
include/linux/fsnotify_backend.h
··· 206 206 207 207 /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ 208 208 struct mutex mark_mutex; /* protect marks_list */ 209 - atomic_t num_marks; /* 1 for each mark and 1 for not being 210 - * past the point of no return when freeing 211 - * a group */ 212 209 atomic_t user_waits; /* Number of tasks waiting for user 213 210 * response */ 214 211 struct list_head marks_list; /* all inode marks for this group */ ··· 237 240 wait_queue_head_t access_waitq; 238 241 int flags; /* flags from fanotify_init() */ 239 242 int f_flags; /* event_f_flags from fanotify_init() */ 240 - unsigned int max_marks; 241 - struct user_struct *user; 243 + struct ucounts *ucounts; 242 244 } fanotify_data; 243 245 #endif /* CONFIG_FANOTIFY */ 244 246 };
-3
include/linux/sched/user.h
··· 14 14 refcount_t __count; /* reference count */ 15 15 atomic_t processes; /* How many processes does this user have? */ 16 16 atomic_t sigpending; /* How many pending signals does this user have? */ 17 - #ifdef CONFIG_FANOTIFY 18 - atomic_t fanotify_listeners; 19 - #endif 20 17 #ifdef CONFIG_EPOLL 21 18 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 22 19 #endif
+4
include/linux/user_namespace.h
··· 50 50 UCOUNT_INOTIFY_INSTANCES, 51 51 UCOUNT_INOTIFY_WATCHES, 52 52 #endif 53 + #ifdef CONFIG_FANOTIFY 54 + UCOUNT_FANOTIFY_GROUPS, 55 + UCOUNT_FANOTIFY_MARKS, 56 + #endif 53 57 UCOUNT_COUNTS, 54 58 }; 55 59
+11 -1
kernel/sysctl.c
··· 148 148 #ifdef CONFIG_INOTIFY_USER 149 149 #include <linux/inotify.h> 150 150 #endif 151 + #ifdef CONFIG_FANOTIFY 152 + #include <linux/fanotify.h> 153 + #endif 151 154 152 155 #ifdef CONFIG_PROC_SYSCTL 153 156 ··· 3261 3258 .mode = 0555, 3262 3259 .child = inotify_table, 3263 3260 }, 3264 - #endif 3261 + #endif 3262 + #ifdef CONFIG_FANOTIFY 3263 + { 3264 + .procname = "fanotify", 3265 + .mode = 0555, 3266 + .child = fanotify_table, 3267 + }, 3268 + #endif 3265 3269 #ifdef CONFIG_EPOLL 3266 3270 { 3267 3271 .procname = "epoll",
+4
kernel/ucount.c
··· 74 74 UCOUNT_ENTRY("max_inotify_instances"), 75 75 UCOUNT_ENTRY("max_inotify_watches"), 76 76 #endif 77 + #ifdef CONFIG_FANOTIFY 78 + UCOUNT_ENTRY("max_fanotify_groups"), 79 + UCOUNT_ENTRY("max_fanotify_marks"), 80 + #endif 77 81 { } 78 82 }; 79 83 #endif /* CONFIG_SYSCTL */