Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

[PATCH] per-task delay accounting taskstats interface: control exit data through cpumasks

On systems with a large number of cpus, even a modest rate of tasks
exiting per cpu can produce enough taskstats data on thread exit to
overflow a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for a
limited and specific set of cpus. By scaling the number of listeners
and/or the cpus they monitor, userspace can handle the statistical data
overload more gracefully.

In this patch, each listener registers to listen to a specific set of cpus
by specifying a cpumask. The interest is recorded per-cpu. When a task
exits on a cpu, its taskstats data is unicast to each listener interested
in that cpu.

Thanks to Andrew Morton for pointing out the various scalability and
general concerns of previous attempts and for suggesting this design.

[akpm@osdl.org: build fix]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Authored by Shailabh Nagar and committed by Linus Torvalds
f9fd8914 c8924363

+198 -38
+2 -2
include/linux/taskstats.h
··· 91 91 }; 92 92 93 93 94 - #define TASKSTATS_LISTEN_GROUP 0x1 95 - 96 94 /* 97 95 * Commands sent from userspace 98 96 * Not versioned. New commands should only be inserted at the enum's end ··· 122 124 TASKSTATS_CMD_ATTR_UNSPEC = 0, 123 125 TASKSTATS_CMD_ATTR_PID, 124 126 TASKSTATS_CMD_ATTR_TGID, 127 + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, 128 + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, 125 129 __TASKSTATS_CMD_ATTR_MAX, 126 130 }; 127 131
+4 -23
include/linux/taskstats_kern.h
··· 11 11 #include <linux/sched.h> 12 12 #include <net/genetlink.h> 13 13 14 - enum { 15 - TASKSTATS_MSG_UNICAST, /* send data only to requester */ 16 - TASKSTATS_MSG_MULTICAST, /* send data to a group */ 17 - }; 18 - 19 14 #ifdef CONFIG_TASKSTATS 20 15 extern kmem_cache_t *taskstats_cache; 21 16 extern struct mutex taskstats_exit_mutex; 22 - 23 - static inline int taskstats_has_listeners(void) 24 - { 25 - if (!genl_sock) 26 - return 0; 27 - return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP); 28 - } 29 - 30 - 31 - static inline void taskstats_exit_alloc(struct taskstats **ptidstats) 32 - { 33 - *ptidstats = NULL; 34 - if (taskstats_has_listeners()) 35 - *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 36 - } 37 17 38 18 static inline void taskstats_exit_free(struct taskstats *tidstats) 39 19 { ··· 62 82 kmem_cache_free(taskstats_cache, stats); 63 83 } 64 84 65 - extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); 85 + extern void taskstats_exit_alloc(struct taskstats **, unsigned int *); 86 + extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int); 66 87 extern void taskstats_init_early(void); 67 88 extern void taskstats_tgid_alloc(struct signal_struct *); 68 89 #else 69 - static inline void taskstats_exit_alloc(struct taskstats **ptidstats) 90 + static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 70 91 {} 71 92 static inline void taskstats_exit_free(struct taskstats *ptidstats) 72 93 {} 73 94 static inline void taskstats_exit_send(struct task_struct *tsk, 74 95 struct taskstats *tidstats, 75 - int group_dead) 96 + int group_dead, unsigned int cpu) 76 97 {} 77 98 static inline void taskstats_tgid_init(struct signal_struct *sig) 78 99 {}
+3 -2
kernel/exit.c
··· 847 847 struct task_struct *tsk = current; 848 848 struct taskstats *tidstats; 849 849 int group_dead; 850 + unsigned int mycpu; 850 851 851 852 profile_task_exit(tsk); 852 853 ··· 885 884 current->comm, current->pid, 886 885 preempt_count()); 887 886 888 - taskstats_exit_alloc(&tidstats); 887 + taskstats_exit_alloc(&tidstats, &mycpu); 889 888 890 889 acct_update_integrals(tsk); 891 890 if (tsk->mm) { ··· 906 905 #endif 907 906 if (unlikely(tsk->audit_context)) 908 907 audit_free(tsk); 909 - taskstats_exit_send(tsk, tidstats, group_dead); 908 + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); 910 909 taskstats_exit_free(tidstats); 911 910 delayacct_tsk_exit(tsk); 912 911
+189 -11
kernel/taskstats.c
··· 19 19 #include <linux/kernel.h> 20 20 #include <linux/taskstats_kern.h> 21 21 #include <linux/delayacct.h> 22 + #include <linux/cpumask.h> 23 + #include <linux/percpu.h> 22 24 #include <net/genetlink.h> 23 25 #include <asm/atomic.h> 26 + 27 + /* 28 + * Maximum length of a cpumask that can be specified in 29 + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 30 + */ 31 + #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 24 32 25 33 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 26 34 static int family_registered; ··· 45 37 __read_mostly = { 46 38 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 47 39 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 40 + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 41 + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 42 + 43 + struct listener { 44 + struct list_head list; 45 + pid_t pid; 48 46 }; 49 47 48 + struct listener_list { 49 + struct rw_semaphore sem; 50 + struct list_head list; 51 + }; 52 + static DEFINE_PER_CPU(struct listener_list, listener_array); 53 + 54 + enum actions { 55 + REGISTER, 56 + DEREGISTER, 57 + CPU_DONT_CARE 58 + }; 50 59 51 60 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 52 61 void **replyp, size_t size) ··· 99 74 return 0; 100 75 } 101 76 102 - static int send_reply(struct sk_buff *skb, pid_t pid, int event) 77 + /* 78 + * Send taskstats data in @skb to listener with nl_pid @pid 79 + */ 80 + static int send_reply(struct sk_buff *skb, pid_t pid) 103 81 { 104 82 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 105 - void *reply; 83 + void *reply = genlmsg_data(genlhdr); 106 84 int rc; 107 - 108 - reply = genlmsg_data(genlhdr); 109 85 110 86 rc = genlmsg_end(skb, reply); 111 87 if (rc < 0) { ··· 114 88 return rc; 115 89 } 116 90 117 - if (event == TASKSTATS_MSG_MULTICAST) 118 - return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); 119 91 return genlmsg_unicast(skb, pid); 92 + } 93 + 
94 + /* 95 + * Send taskstats data in @skb to listeners registered for @cpu's exit data 96 + */ 97 + static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 98 + { 99 + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 100 + struct listener_list *listeners; 101 + struct listener *s, *tmp; 102 + struct sk_buff *skb_next, *skb_cur = skb; 103 + void *reply = genlmsg_data(genlhdr); 104 + int rc, ret; 105 + 106 + rc = genlmsg_end(skb, reply); 107 + if (rc < 0) { 108 + nlmsg_free(skb); 109 + return rc; 110 + } 111 + 112 + rc = 0; 113 + listeners = &per_cpu(listener_array, cpu); 114 + down_write(&listeners->sem); 115 + list_for_each_entry_safe(s, tmp, &listeners->list, list) { 116 + skb_next = NULL; 117 + if (!list_is_last(&s->list, &listeners->list)) { 118 + skb_next = skb_clone(skb_cur, GFP_KERNEL); 119 + if (!skb_next) { 120 + nlmsg_free(skb_cur); 121 + rc = -ENOMEM; 122 + break; 123 + } 124 + } 125 + ret = genlmsg_unicast(skb_cur, s->pid); 126 + if (ret == -ECONNREFUSED) { 127 + list_del(&s->list); 128 + kfree(s); 129 + rc = ret; 130 + } 131 + skb_cur = skb_next; 132 + } 133 + up_write(&listeners->sem); 134 + 135 + return rc; 120 136 } 121 137 122 138 static int fill_pid(pid_t pid, struct task_struct *pidtsk, ··· 272 204 return; 273 205 } 274 206 207 + static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) 208 + { 209 + struct listener_list *listeners; 210 + struct listener *s, *tmp; 211 + unsigned int cpu; 212 + cpumask_t mask = *maskp; 275 213 276 - static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) 214 + if (!cpus_subset(mask, cpu_possible_map)) 215 + return -EINVAL; 216 + 217 + if (isadd == REGISTER) { 218 + for_each_cpu_mask(cpu, mask) { 219 + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 220 + cpu_to_node(cpu)); 221 + if (!s) 222 + goto cleanup; 223 + s->pid = pid; 224 + INIT_LIST_HEAD(&s->list); 225 + 226 + listeners = &per_cpu(listener_array, cpu); 227 + 
down_write(&listeners->sem); 228 + list_add(&s->list, &listeners->list); 229 + up_write(&listeners->sem); 230 + } 231 + return 0; 232 + } 233 + 234 + /* Deregister or cleanup */ 235 + cleanup: 236 + for_each_cpu_mask(cpu, mask) { 237 + listeners = &per_cpu(listener_array, cpu); 238 + down_write(&listeners->sem); 239 + list_for_each_entry_safe(s, tmp, &listeners->list, list) { 240 + if (s->pid == pid) { 241 + list_del(&s->list); 242 + kfree(s); 243 + break; 244 + } 245 + } 246 + up_write(&listeners->sem); 247 + } 248 + return 0; 249 + } 250 + 251 + static int parse(struct nlattr *na, cpumask_t *mask) 252 + { 253 + char *data; 254 + int len; 255 + int ret; 256 + 257 + if (na == NULL) 258 + return 1; 259 + len = nla_len(na); 260 + if (len > TASKSTATS_CPUMASK_MAXLEN) 261 + return -E2BIG; 262 + if (len < 1) 263 + return -EINVAL; 264 + data = kmalloc(len, GFP_KERNEL); 265 + if (!data) 266 + return -ENOMEM; 267 + nla_strlcpy(data, na, len); 268 + ret = cpulist_parse(data, *mask); 269 + kfree(data); 270 + return ret; 271 + } 272 + 273 + static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 277 274 { 278 275 int rc = 0; 279 276 struct sk_buff *rep_skb; ··· 346 213 void *reply; 347 214 size_t size; 348 215 struct nlattr *na; 216 + cpumask_t mask; 217 + 218 + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 219 + if (rc < 0) 220 + return rc; 221 + if (rc == 0) 222 + return add_del_listener(info->snd_pid, &mask, REGISTER); 223 + 224 + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); 225 + if (rc < 0) 226 + return rc; 227 + if (rc == 0) 228 + return add_del_listener(info->snd_pid, &mask, DEREGISTER); 349 229 350 230 /* 351 231 * Size includes space for nested attributes ··· 398 252 399 253 nla_nest_end(rep_skb, na); 400 254 401 - return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); 255 + return send_reply(rep_skb, info->snd_pid); 402 256 403 257 nla_put_failure: 404 258 return genlmsg_cancel(rep_skb, 
reply); ··· 407 261 return rc; 408 262 } 409 263 264 + void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 265 + { 266 + struct listener_list *listeners; 267 + struct taskstats *tmp; 268 + /* 269 + * This is the cpu on which the task is exiting currently and will 270 + * be the one for which the exit event is sent, even if the cpu 271 + * on which this function is running changes later. 272 + */ 273 + *mycpu = raw_smp_processor_id(); 274 + 275 + *ptidstats = NULL; 276 + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 277 + if (!tmp) 278 + return; 279 + 280 + listeners = &per_cpu(listener_array, *mycpu); 281 + down_read(&listeners->sem); 282 + if (!list_empty(&listeners->list)) { 283 + *ptidstats = tmp; 284 + tmp = NULL; 285 + } 286 + up_read(&listeners->sem); 287 + kfree(tmp); 288 + } 289 + 410 290 /* Send pid data out on exit */ 411 291 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 412 - int group_dead) 292 + int group_dead, unsigned int mycpu) 413 293 { 414 294 int rc; 415 295 struct sk_buff *rep_skb; ··· 496 324 nla_nest_end(rep_skb, na); 497 325 498 326 send: 499 - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); 327 + send_cpu_listeners(rep_skb, mycpu); 500 328 return; 501 329 502 330 nla_put_failure: ··· 510 338 511 339 static struct genl_ops taskstats_ops = { 512 340 .cmd = TASKSTATS_CMD_GET, 513 - .doit = taskstats_send_stats, 341 + .doit = taskstats_user_cmd, 514 342 .policy = taskstats_cmd_get_policy, 515 343 }; 516 344 517 345 /* Needed early in initialization */ 518 346 void __init taskstats_init_early(void) 519 347 { 348 + unsigned int i; 349 + 520 350 taskstats_cache = kmem_cache_create("taskstats_cache", 521 351 sizeof(struct taskstats), 522 352 0, SLAB_PANIC, NULL, NULL); 353 + for_each_possible_cpu(i) { 354 + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 355 + init_rwsem(&(per_cpu(listener_array, i).sem)); 356 + } 523 357 } 524 358 525 359 static int __init taskstats_init(void)