Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing: fprobe: use rhltable for fprobe_ip_table

For now, all the kernel functions that are hooked by the fprobe are
added to the hash table "fprobe_ip_table". Its key is the function
address, and its value is a "struct fprobe_hlist_node".

The bucket count of the hash table is FPROBE_IP_TABLE_SIZE, which is 256.
This means the overhead of a hash table lookup will grow linearly once
the count of the functions hooked by fprobe exceeds 256. When we try to
hook all the kernel functions, the overhead will be huge.

Therefore, replace the hash table with rhltable to reduce the overhead.

Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

authored by

Menglong Dong and committed by
Masami Hiramatsu (Google)
0de4c70d dcb6fa37

+94 -68
+2 -1
include/linux/fprobe.h
··· 7 7 #include <linux/ftrace.h> 8 8 #include <linux/rcupdate.h> 9 9 #include <linux/refcount.h> 10 + #include <linux/rhashtable.h> 10 11 #include <linux/slab.h> 11 12 12 13 struct fprobe; ··· 27 26 * @fp: The fprobe which owns this. 28 27 */ 29 28 struct fprobe_hlist_node { 30 - struct hlist_node hlist; 29 + struct rhlist_head hlist; 31 30 unsigned long addr; 32 31 struct fprobe *fp; 33 32 };
+92 -67
kernel/trace/fprobe.c
··· 10 10 #include <linux/kprobes.h> 11 11 #include <linux/list.h> 12 12 #include <linux/mutex.h> 13 + #include <linux/rhashtable.h> 13 14 #include <linux/slab.h> 14 15 #include <linux/sort.h> 15 16 ··· 42 41 * - RCU hlist traversal under disabling preempt 43 42 */ 44 43 static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; 45 - static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; 44 + static struct rhltable fprobe_ip_table; 46 45 static DEFINE_MUTEX(fprobe_mutex); 47 46 48 - /* 49 - * Find first fprobe in the hlist. It will be iterated twice in the entry 50 - * probe, once for correcting the total required size, the second time is 51 - * calling back the user handlers. 52 - * Thus the hlist in the fprobe_table must be sorted and new probe needs to 53 - * be added *before* the first fprobe. 54 - */ 55 - static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) 47 + static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) 56 48 { 57 - struct fprobe_hlist_node *node; 58 - struct hlist_head *head; 59 - 60 - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; 61 - hlist_for_each_entry_rcu(node, head, hlist, 62 - lockdep_is_held(&fprobe_mutex)) { 63 - if (node->addr == ip) 64 - return node; 65 - } 66 - return NULL; 49 + return hash_ptr(*(unsigned long **)data, 32); 67 50 } 68 - NOKPROBE_SYMBOL(find_first_fprobe_node); 51 + 52 + static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, 53 + const void *ptr) 54 + { 55 + unsigned long key = *(unsigned long *)arg->key; 56 + const struct fprobe_hlist_node *n = ptr; 57 + 58 + return n->addr != key; 59 + } 60 + 61 + static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) 62 + { 63 + const struct fprobe_hlist_node *n = data; 64 + 65 + return hash_ptr((void *)n->addr, 32); 66 + } 67 + 68 + static const struct rhashtable_params fprobe_rht_params = { 69 + .head_offset = offsetof(struct fprobe_hlist_node, hlist), 70 + .key_offset = offsetof(struct 
fprobe_hlist_node, addr), 71 + .key_len = sizeof_field(struct fprobe_hlist_node, addr), 72 + .hashfn = fprobe_node_hashfn, 73 + .obj_hashfn = fprobe_node_obj_hashfn, 74 + .obj_cmpfn = fprobe_node_cmp, 75 + .automatic_shrinking = true, 76 + }; 69 77 70 78 /* Node insertion and deletion requires the fprobe_mutex */ 71 - static void insert_fprobe_node(struct fprobe_hlist_node *node) 79 + static int insert_fprobe_node(struct fprobe_hlist_node *node) 72 80 { 73 - unsigned long ip = node->addr; 74 - struct fprobe_hlist_node *next; 75 - struct hlist_head *head; 76 - 77 81 lockdep_assert_held(&fprobe_mutex); 78 82 79 - next = find_first_fprobe_node(ip); 80 - if (next) { 81 - hlist_add_before_rcu(&node->hlist, &next->hlist); 82 - return; 83 - } 84 - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; 85 - hlist_add_head_rcu(&node->hlist, head); 83 + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); 86 84 } 87 85 88 86 /* Return true if there are synonims */ 89 87 static bool delete_fprobe_node(struct fprobe_hlist_node *node) 90 88 { 91 89 lockdep_assert_held(&fprobe_mutex); 90 + bool ret; 92 91 93 92 /* Avoid double deleting */ 94 93 if (READ_ONCE(node->fp) != NULL) { 95 94 WRITE_ONCE(node->fp, NULL); 96 - hlist_del_rcu(&node->hlist); 95 + rhltable_remove(&fprobe_ip_table, &node->hlist, 96 + fprobe_rht_params); 97 97 } 98 - return !!find_first_fprobe_node(node->addr); 98 + 99 + rcu_read_lock(); 100 + ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, 101 + fprobe_rht_params); 102 + rcu_read_unlock(); 103 + 104 + return ret; 99 105 } 100 106 101 107 /* Check existence of the fprobe */ ··· 257 249 static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, 258 250 struct ftrace_regs *fregs) 259 251 { 260 - struct fprobe_hlist_node *node, *first; 261 252 unsigned long *fgraph_data = NULL; 262 253 unsigned long func = trace->func; 254 + struct fprobe_hlist_node *node; 255 + struct rhlist_head *head, *pos; 263 256 
unsigned long ret_ip; 264 257 int reserved_words; 265 258 struct fprobe *fp; ··· 269 260 if (WARN_ON_ONCE(!fregs)) 270 261 return 0; 271 262 272 - first = node = find_first_fprobe_node(func); 273 - if (unlikely(!first)) 274 - return 0; 275 - 263 + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); 276 264 reserved_words = 0; 277 - hlist_for_each_entry_from_rcu(node, hlist) { 265 + rhl_for_each_entry_rcu(node, pos, head, hlist) { 278 266 if (node->addr != func) 279 - break; 267 + continue; 280 268 fp = READ_ONCE(node->fp); 281 269 if (!fp || !fp->exit_handler) 282 270 continue; ··· 284 278 reserved_words += 285 279 FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); 286 280 } 287 - node = first; 288 281 if (reserved_words) { 289 282 fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); 290 283 if (unlikely(!fgraph_data)) { 291 - hlist_for_each_entry_from_rcu(node, hlist) { 284 + rhl_for_each_entry_rcu(node, pos, head, hlist) { 292 285 if (node->addr != func) 293 - break; 286 + continue; 294 287 fp = READ_ONCE(node->fp); 295 288 if (fp && !fprobe_disabled(fp)) 296 289 fp->nmissed++; ··· 304 299 */ 305 300 ret_ip = ftrace_regs_get_return_address(fregs); 306 301 used = 0; 307 - hlist_for_each_entry_from_rcu(node, hlist) { 302 + rhl_for_each_entry_rcu(node, pos, head, hlist) { 308 303 int data_size; 309 304 void *data; 310 305 311 306 if (node->addr != func) 312 - break; 307 + continue; 313 308 fp = READ_ONCE(node->fp); 314 309 if (!fp || fprobe_disabled(fp)) 315 310 continue; ··· 454 449 return 0; 455 450 } 456 451 457 - static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, 458 - struct fprobe_addr_list *alist) 452 + static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, 453 + struct fprobe_addr_list *alist) 459 454 { 460 - struct fprobe_hlist_node *node; 461 455 int ret = 0; 462 456 463 - hlist_for_each_entry_rcu(node, head, hlist, 464 - 
lockdep_is_held(&fprobe_mutex)) { 465 - if (!within_module(node->addr, mod)) 466 - continue; 467 - if (delete_fprobe_node(node)) 468 - continue; 469 - /* 470 - * If failed to update alist, just continue to update hlist. 471 - * Therefore, at list user handler will not hit anymore. 472 - */ 473 - if (!ret) 474 - ret = fprobe_addr_list_add(alist, node->addr); 475 - } 457 + if (!within_module(node->addr, mod)) 458 + return; 459 + if (delete_fprobe_node(node)) 460 + return; 461 + /* 462 + * If failed to update alist, just continue to update hlist. 463 + * Therefore, at list user handler will not hit anymore. 464 + */ 465 + if (!ret) 466 + ret = fprobe_addr_list_add(alist, node->addr); 476 467 } 477 468 478 469 /* Handle module unloading to manage fprobe_ip_table. */ ··· 476 475 unsigned long val, void *data) 477 476 { 478 477 struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; 478 + struct fprobe_hlist_node *node; 479 + struct rhashtable_iter iter; 479 480 struct module *mod = data; 480 - int i; 481 481 482 482 if (val != MODULE_STATE_GOING) 483 483 return NOTIFY_DONE; ··· 489 487 return NOTIFY_DONE; 490 488 491 489 mutex_lock(&fprobe_mutex); 492 - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) 493 - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); 490 + rhltable_walk_enter(&fprobe_ip_table, &iter); 491 + do { 492 + rhashtable_walk_start(&iter); 493 + 494 + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) 495 + fprobe_remove_node_in_module(mod, node, &alist); 496 + 497 + rhashtable_walk_stop(&iter); 498 + } while (node == ERR_PTR(-EAGAIN)); 499 + rhashtable_walk_exit(&iter); 494 500 495 501 if (alist.index > 0) 496 502 ftrace_set_filter_ips(&fprobe_graph_ops.ops, ··· 738 728 ret = fprobe_graph_add_ips(addrs, num); 739 729 if (!ret) { 740 730 add_fprobe_hash(fp); 741 - for (i = 0; i < hlist_array->size; i++) 742 - insert_fprobe_node(&hlist_array->array[i]); 731 + for (i = 0; i < hlist_array->size; i++) { 732 + ret = 
insert_fprobe_node(&hlist_array->array[i]); 733 + if (ret) 734 + break; 735 + } 736 + /* fallback on insert error */ 737 + if (ret) { 738 + for (i--; i >= 0; i--) 739 + delete_fprobe_node(&hlist_array->array[i]); 740 + } 743 741 } 744 742 mutex_unlock(&fprobe_mutex); 745 743 ··· 843 825 return ret; 844 826 } 845 827 EXPORT_SYMBOL_GPL(unregister_fprobe); 828 + 829 + static int __init fprobe_initcall(void) 830 + { 831 + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); 832 + return 0; 833 + } 834 + late_initcall(fprobe_initcall);