Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: mmap_lock: optimize mmap_lock tracepoints

We are starting to deploy mmap_lock tracepoint monitoring across our
fleet, and the early results showed that these tracepoints consume a
significant amount of CPU in kernfs_path_from_node when enabled.

It seems like the kernel is trying to resolve the cgroup path in the
fast path of the locking code path when the tracepoints are enabled. In
addition, for some applications, their metrics regress when
monitoring is enabled.

The cgroup path resolution can be slow and should not be done in the
fast path. Most userspace tools, like bpftrace, provide functionality
to get the cgroup path from the cgroup id, so let's just trace the cgroup
id, and users can use better tools to get the path in the slow path.

Link: https://lkml.kernel.org/r/20241125171617.113892-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Shakeel Butt and committed by
Andrew Morton
9023691d 66539952

+40 -64
+22
include/linux/memcontrol.h
··· 1046 1046 1047 1047 void split_page_memcg(struct page *head, int old_order, int new_order); 1048 1048 1049 + static inline u64 cgroup_id_from_mm(struct mm_struct *mm) 1050 + { 1051 + struct mem_cgroup *memcg; 1052 + u64 id; 1053 + 1054 + if (mem_cgroup_disabled()) 1055 + return 0; 1056 + 1057 + rcu_read_lock(); 1058 + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1059 + if (!memcg) 1060 + memcg = root_mem_cgroup; 1061 + id = cgroup_id(memcg->css.cgroup); 1062 + rcu_read_unlock(); 1063 + return id; 1064 + } 1065 + 1049 1066 #else /* CONFIG_MEMCG */ 1050 1067 1051 1068 #define MEM_CGROUP_ID_SHIFT 0 ··· 1482 1465 1483 1466 static inline void split_page_memcg(struct page *head, int old_order, int new_order) 1484 1467 { 1468 + } 1469 + 1470 + static inline u64 cgroup_id_from_mm(struct mm_struct *mm) 1471 + { 1472 + return 0; 1485 1473 } 1486 1474 #endif /* CONFIG_MEMCG */ 1487 1475
+15 -17
include/trace/events/mmap_lock.h
··· 5 5 #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) 6 6 #define _TRACE_MMAP_LOCK_H 7 7 8 + #include <linux/memcontrol.h> 8 9 #include <linux/tracepoint.h> 9 10 #include <linux/types.h> 10 11 ··· 13 12 14 13 DECLARE_EVENT_CLASS(mmap_lock, 15 14 16 - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), 15 + TP_PROTO(struct mm_struct *mm, bool write), 17 16 18 - TP_ARGS(mm, memcg_path, write), 17 + TP_ARGS(mm, write), 19 18 20 19 TP_STRUCT__entry( 21 20 __field(struct mm_struct *, mm) 22 - __string(memcg_path, memcg_path) 21 + __field(u64, memcg_id) 23 22 __field(bool, write) 24 23 ), 25 24 26 25 TP_fast_assign( 27 26 __entry->mm = mm; 28 - __assign_str(memcg_path); 27 + __entry->memcg_id = cgroup_id_from_mm(mm); 29 28 __entry->write = write; 30 29 ), 31 30 32 31 TP_printk( 33 - "mm=%p memcg_path=%s write=%s", 34 - __entry->mm, 35 - __get_str(memcg_path), 32 + "mm=%p memcg_id=%llu write=%s", 33 + __entry->mm, __entry->memcg_id, 36 34 __entry->write ? 
"true" : "false" 37 35 ) 38 36 ); 39 37 40 38 #define DEFINE_MMAP_LOCK_EVENT(name) \ 41 39 DEFINE_EVENT(mmap_lock, name, \ 42 - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ 43 - bool write), \ 44 - TP_ARGS(mm, memcg_path, write)) 40 + TP_PROTO(struct mm_struct *mm, bool write), \ 41 + TP_ARGS(mm, write)) 45 42 46 43 DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); 47 44 DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); 48 45 49 46 TRACE_EVENT(mmap_lock_acquire_returned, 50 47 51 - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, 52 - bool success), 48 + TP_PROTO(struct mm_struct *mm, bool write, bool success), 53 49 54 - TP_ARGS(mm, memcg_path, write, success), 50 + TP_ARGS(mm, write, success), 55 51 56 52 TP_STRUCT__entry( 57 53 __field(struct mm_struct *, mm) 58 - __string(memcg_path, memcg_path) 54 + __field(u64, memcg_id) 59 55 __field(bool, write) 60 56 __field(bool, success) 61 57 ), 62 58 63 59 TP_fast_assign( 64 60 __entry->mm = mm; 65 - __assign_str(memcg_path); 61 + __entry->memcg_id = cgroup_id_from_mm(mm); 66 62 __entry->write = write; 67 63 __entry->success = success; 68 64 ), 69 65 70 66 TP_printk( 71 - "mm=%p memcg_path=%s write=%s success=%s", 67 + "mm=%p memcg_id=%llu write=%s success=%s", 72 68 __entry->mm, 73 - __get_str(memcg_path), 69 + __entry->memcg_id, 74 70 __entry->write ? "true" : "false", 75 71 __entry->success ? "true" : "false" 76 72 )
+3 -47
mm/mmap_lock.c
··· 17 17 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); 18 18 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); 19 19 20 - #ifdef CONFIG_MEMCG 21 - 22 - /* 23 - * Size of the buffer for memcg path names. Ignoring stack trace support, 24 - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. 25 - */ 26 - #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL 27 - 28 - #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ 29 - do { \ 30 - if (trace_mmap_lock_##type##_enabled()) { \ 31 - char buf[MEMCG_PATH_BUF_SIZE]; \ 32 - get_mm_memcg_path(mm, buf, sizeof(buf)); \ 33 - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ 34 - } \ 35 - } while (0) 36 - 37 - #else /* !CONFIG_MEMCG */ 38 - 39 - #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ 40 - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) 41 - 42 - #endif /* CONFIG_MEMCG */ 43 - 44 20 #ifdef CONFIG_TRACING 45 - #ifdef CONFIG_MEMCG 46 - /* 47 - * Write the given mm_struct's memcg path to a buffer. If the path cannot be 48 - * determined, empty string is written. 49 - */ 50 - static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) 51 - { 52 - struct mem_cgroup *memcg; 53 - 54 - buf[0] = '\0'; 55 - memcg = get_mem_cgroup_from_mm(mm); 56 - if (memcg == NULL) 57 - return; 58 - if (memcg->css.cgroup) 59 - cgroup_path(memcg->css.cgroup, buf, buflen); 60 - css_put(&memcg->css); 61 - } 62 - 63 - #endif /* CONFIG_MEMCG */ 64 - 65 21 /* 66 22 * Trace calls must be in a separate file, as otherwise there's a circular 67 23 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. 
··· 25 69 26 70 void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) 27 71 { 28 - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); 72 + trace_mmap_lock_start_locking(mm, write); 29 73 } 30 74 EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); 31 75 32 76 void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, 33 77 bool success) 34 78 { 35 - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); 79 + trace_mmap_lock_acquire_returned(mm, write, success); 36 80 } 37 81 EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); 38 82 39 83 void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) 40 84 { 41 - TRACE_MMAP_LOCK_EVENT(released, mm, write); 85 + trace_mmap_lock_released(mm, write); 42 86 } 43 87 EXPORT_SYMBOL(__mmap_lock_do_trace_released); 44 88 #endif /* CONFIG_TRACING */