Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tracing: Introduce simple_ring_buffer

Add a simple implementation of the kernel ring-buffer. It is intended to
be used later by ring-buffer remotes such as the pKVM hypervisor, hence
the need for a cut-down (write-only) version without any dependencies.

Link: https://patch.msgid.link/20260309162516.2623589-14-vdonnefort@google.com
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

authored by

Vincent Donnefort and committed by
Steven Rostedt (Google)
34e5b958 93ae1b76

+525
+57
include/linux/simple_ring_buffer.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_SIMPLE_RING_BUFFER_H 3 + #define _LINUX_SIMPLE_RING_BUFFER_H 4 + 5 + #include <linux/list.h> 6 + #include <linux/ring_buffer.h> 7 + #include <linux/ring_buffer_types.h> 8 + #include <linux/types.h> 9 + 10 + /* 11 + * Ideally those struct would stay private but the caller needs to know 12 + * the allocation size for simple_ring_buffer_init(). 13 + */ 14 + struct simple_buffer_page { 15 + struct list_head link; 16 + struct buffer_data_page *page; 17 + u64 entries; 18 + u32 write; 19 + u32 id; 20 + }; 21 + 22 + struct simple_rb_per_cpu { 23 + struct simple_buffer_page *tail_page; 24 + struct simple_buffer_page *reader_page; 25 + struct simple_buffer_page *head_page; 26 + struct simple_buffer_page *bpages; 27 + struct trace_buffer_meta *meta; 28 + u32 nr_pages; 29 + 30 + #define SIMPLE_RB_UNAVAILABLE 0 31 + #define SIMPLE_RB_READY 1 32 + #define SIMPLE_RB_WRITING 2 33 + u32 status; 34 + 35 + u64 last_overrun; 36 + u64 write_stamp; 37 + 38 + struct simple_rb_cbs *cbs; 39 + }; 40 + 41 + int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, 42 + const struct ring_buffer_desc *desc); 43 + 44 + void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer); 45 + 46 + void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, 47 + u64 timestamp); 48 + 49 + void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer); 50 + 51 + int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable); 52 + 53 + int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); 54 + 55 + int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); 56 + 57 + #endif
+3
kernel/trace/Kconfig
··· 1284 1284 config TRACE_REMOTE 1285 1285 bool 1286 1286 1287 + config SIMPLE_RING_BUFFER 1288 + bool 1289 + 1287 1290 endif # FTRACE
+1
kernel/trace/Makefile
··· 129 129 obj-$(CONFIG_RV) += rv/ 130 130 131 131 obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o 132 + obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o 132 133 libftrace-y := ftrace.o
+464
kernel/trace/simple_ring_buffer.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2025 - Google LLC 4 + * Author: Vincent Donnefort <vdonnefort@google.com> 5 + */ 6 + 7 + #include <linux/atomic.h> 8 + #include <linux/simple_ring_buffer.h> 9 + 10 + #include <asm/barrier.h> 11 + #include <asm/local.h> 12 + 13 + enum simple_rb_link_type { 14 + SIMPLE_RB_LINK_NORMAL = 0, 15 + SIMPLE_RB_LINK_HEAD = 1, 16 + SIMPLE_RB_LINK_HEAD_MOVING 17 + }; 18 + 19 + #define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) 20 + 21 + static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) 22 + { 23 + unsigned long link = (unsigned long)bpage->link.next; 24 + 25 + link &= SIMPLE_RB_LINK_MASK; 26 + link |= SIMPLE_RB_LINK_HEAD; 27 + 28 + /* 29 + * Paired with simple_rb_find_head() to order access between the head 30 + * link and overrun. It ensures we always report an up-to-date value 31 + * after swapping the reader page. 32 + */ 33 + smp_store_release(&bpage->link.next, (struct list_head *)link); 34 + } 35 + 36 + static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, 37 + struct simple_buffer_page *dst, 38 + enum simple_rb_link_type new_type) 39 + { 40 + unsigned long *link = (unsigned long *)(&bpage->link.next); 41 + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; 42 + unsigned long new = (unsigned long)(&dst->link) | new_type; 43 + 44 + return try_cmpxchg(link, &old, new); 45 + } 46 + 47 + static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) 48 + { 49 + unsigned long link = (unsigned long)bpage->link.next; 50 + 51 + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); 52 + } 53 + 54 + static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) 55 + { 56 + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; 57 + 58 + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); 59 + } 60 + 61 + static struct 
simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) 62 + { 63 + return simple_bpage_from_link(bpage->link.next); 64 + } 65 + 66 + static void simple_bpage_reset(struct simple_buffer_page *bpage) 67 + { 68 + bpage->write = 0; 69 + bpage->entries = 0; 70 + 71 + local_set(&bpage->page->commit, 0); 72 + } 73 + 74 + static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) 75 + { 76 + INIT_LIST_HEAD(&bpage->link); 77 + bpage->page = (struct buffer_data_page *)page; 78 + 79 + simple_bpage_reset(bpage); 80 + } 81 + 82 + #define simple_rb_meta_inc(__meta, __inc) \ 83 + WRITE_ONCE((__meta), (__meta + __inc)) 84 + 85 + static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) 86 + { 87 + return !!cpu_buffer->bpages; 88 + } 89 + 90 + static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) 91 + { 92 + int retry = cpu_buffer->nr_pages * 2; 93 + struct simple_buffer_page *head; 94 + 95 + head = cpu_buffer->head_page; 96 + 97 + while (retry--) { 98 + unsigned long link; 99 + 100 + spin: 101 + /* See smp_store_release in simple_bpage_set_head_link() */ 102 + link = (unsigned long)smp_load_acquire(&head->link.prev->next); 103 + 104 + switch (link & ~SIMPLE_RB_LINK_MASK) { 105 + /* Found the head */ 106 + case SIMPLE_RB_LINK_HEAD: 107 + cpu_buffer->head_page = head; 108 + return 0; 109 + /* The writer caught the head, we can spin, that won't be long */ 110 + case SIMPLE_RB_LINK_HEAD_MOVING: 111 + goto spin; 112 + } 113 + 114 + head = simple_bpage_next_page(head); 115 + } 116 + 117 + return -EBUSY; 118 + } 119 + 120 + /** 121 + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader 122 + * @cpu_buffer: A simple_rb_per_cpu 123 + * 124 + * This function enables consuming reading. It ensures the current head page will not be overwritten 125 + * and can be safely read. 
126 + * 127 + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the 128 + * head page. 129 + */ 130 + int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) 131 + { 132 + struct simple_buffer_page *last, *head, *reader; 133 + unsigned long overrun; 134 + int retry = 8; 135 + int ret; 136 + 137 + if (!simple_rb_loaded(cpu_buffer)) 138 + return -ENODEV; 139 + 140 + reader = cpu_buffer->reader_page; 141 + 142 + do { 143 + /* Run after the writer to find the head */ 144 + ret = simple_rb_find_head(cpu_buffer); 145 + if (ret) 146 + return ret; 147 + 148 + head = cpu_buffer->head_page; 149 + 150 + /* Connect the reader page around the header page */ 151 + reader->link.next = head->link.next; 152 + reader->link.prev = head->link.prev; 153 + 154 + /* The last page before the head */ 155 + last = simple_bpage_from_link(head->link.prev); 156 + 157 + /* The reader page points to the new header page */ 158 + simple_bpage_set_head_link(reader); 159 + 160 + overrun = cpu_buffer->meta->overrun; 161 + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); 162 + 163 + if (!retry) 164 + return -EINVAL; 165 + 166 + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); 167 + cpu_buffer->head_page->link.prev = &reader->link; 168 + cpu_buffer->reader_page = head; 169 + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; 170 + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; 171 + cpu_buffer->last_overrun = overrun; 172 + 173 + return 0; 174 + } 175 + EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); 176 + 177 + static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) 178 + { 179 + struct simple_buffer_page *tail, *new_tail; 180 + 181 + tail = cpu_buffer->tail_page; 182 + new_tail = simple_bpage_next_page(tail); 183 + 184 + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { 
185 + /* 186 + * Oh no! we've caught the head. There is none anymore and 187 + * swap_reader will spin until we set the new one. Overrun must 188 + * be written first, to make sure we report the correct number 189 + * of lost events. 190 + */ 191 + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); 192 + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); 193 + 194 + simple_bpage_set_head_link(new_tail); 195 + simple_bpage_set_normal_link(tail); 196 + } 197 + 198 + simple_bpage_reset(new_tail); 199 + cpu_buffer->tail_page = new_tail; 200 + 201 + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); 202 + 203 + return new_tail; 204 + } 205 + 206 + static unsigned long rb_event_size(unsigned long length) 207 + { 208 + struct ring_buffer_event *event; 209 + 210 + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); 211 + } 212 + 213 + static struct ring_buffer_event * 214 + rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) 215 + { 216 + event->type_len = RINGBUF_TYPE_TIME_EXTEND; 217 + event->time_delta = delta & TS_MASK; 218 + event->array[0] = delta >> TS_SHIFT; 219 + 220 + return (struct ring_buffer_event *)((unsigned long)event + 8); 221 + } 222 + 223 + static struct ring_buffer_event * 224 + simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) 225 + { 226 + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); 227 + struct simple_buffer_page *tail = cpu_buffer->tail_page; 228 + struct ring_buffer_event *event; 229 + u32 write, prev_write; 230 + u64 time_delta; 231 + 232 + time_delta = timestamp - cpu_buffer->write_stamp; 233 + 234 + if (test_time_stamp(time_delta)) 235 + ts_ext_size = 8; 236 + 237 + prev_write = tail->write; 238 + write = prev_write + event_size + ts_ext_size; 239 + 240 + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) 241 + tail = simple_rb_move_tail(cpu_buffer); 242 + 243 + if (!tail->entries) { 244 + tail->page->time_stamp = timestamp; 
245 + time_delta = 0; 246 + ts_ext_size = 0; 247 + write = event_size; 248 + prev_write = 0; 249 + } 250 + 251 + tail->write = write; 252 + tail->entries++; 253 + 254 + cpu_buffer->write_stamp = timestamp; 255 + 256 + event = (struct ring_buffer_event *)(tail->page->data + prev_write); 257 + if (ts_ext_size) { 258 + event = rb_event_add_ts_extend(event, time_delta); 259 + time_delta = 0; 260 + } 261 + 262 + event->type_len = 0; 263 + event->time_delta = time_delta; 264 + event->array[0] = event_size - RB_EVNT_HDR_SIZE; 265 + 266 + return event; 267 + } 268 + 269 + /** 270 + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer 271 + * @cpu_buffer: A simple_rb_per_cpu 272 + * @length: Size of the entry in bytes 273 + * @timestamp: Timestamp of the entry 274 + * 275 + * Returns the address of the entry where to write data or NULL 276 + */ 277 + void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, 278 + u64 timestamp) 279 + { 280 + struct ring_buffer_event *rb_event; 281 + 282 + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) 283 + return NULL; 284 + 285 + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); 286 + 287 + return &rb_event->array[1]; 288 + } 289 + EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); 290 + 291 + /** 292 + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() 293 + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved 294 + */ 295 + void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) 296 + { 297 + local_set(&cpu_buffer->tail_page->page->commit, 298 + cpu_buffer->tail_page->write); 299 + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); 300 + 301 + /* 302 + * Paired with simple_rb_enable_tracing() to ensure data is 303 + * written to the ring-buffer before teardown. 
304 + */ 305 + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); 306 + } 307 + EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); 308 + 309 + static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) 310 + { 311 + u32 prev_status; 312 + 313 + if (enable) 314 + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); 315 + 316 + /* Wait for the buffer to be released */ 317 + do { 318 + prev_status = cmpxchg_acquire(&cpu_buffer->status, 319 + SIMPLE_RB_READY, 320 + SIMPLE_RB_UNAVAILABLE); 321 + } while (prev_status == SIMPLE_RB_WRITING); 322 + 323 + return prev_status; 324 + } 325 + 326 + /** 327 + * simple_ring_buffer_reset - Reset @cpu_buffer 328 + * @cpu_buffer: A simple_rb_per_cpu 329 + * 330 + * This will not clear the content of the data, only reset counters and pointers 331 + * 332 + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. 333 + */ 334 + int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) 335 + { 336 + struct simple_buffer_page *bpage; 337 + u32 prev_status; 338 + int ret; 339 + 340 + if (!simple_rb_loaded(cpu_buffer)) 341 + return -ENODEV; 342 + 343 + prev_status = simple_rb_enable_tracing(cpu_buffer, false); 344 + 345 + ret = simple_rb_find_head(cpu_buffer); 346 + if (ret) 347 + return ret; 348 + 349 + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; 350 + do { 351 + simple_bpage_reset(bpage); 352 + bpage = simple_bpage_next_page(bpage); 353 + } while (bpage != cpu_buffer->head_page); 354 + 355 + simple_bpage_reset(cpu_buffer->reader_page); 356 + 357 + cpu_buffer->last_overrun = 0; 358 + cpu_buffer->write_stamp = 0; 359 + 360 + cpu_buffer->meta->reader.read = 0; 361 + cpu_buffer->meta->reader.lost_events = 0; 362 + cpu_buffer->meta->entries = 0; 363 + cpu_buffer->meta->overrun = 0; 364 + cpu_buffer->meta->read = 0; 365 + cpu_buffer->meta->pages_lost = 0; 366 + cpu_buffer->meta->pages_touched = 0; 367 + 368 + if (prev_status == SIMPLE_RB_READY) 369 + 
simple_rb_enable_tracing(cpu_buffer, true); 370 + 371 + return 0; 372 + } 373 + EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); 374 + 375 + /** 376 + * simple_ring_buffer_init - Init @cpu_buffer based on @desc 377 + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 378 + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va 379 + * @desc: A ring_buffer_desc 380 + * 381 + * Returns 0 on success or -EINVAL if the content of @desc is invalid 382 + */ 383 + int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, 384 + const struct ring_buffer_desc *desc) 385 + { 386 + struct simple_buffer_page *bpage = bpages; 387 + int i; 388 + 389 + /* At least 1 reader page and two pages in the ring-buffer */ 390 + if (desc->nr_page_va < 3) 391 + return -EINVAL; 392 + 393 + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); 394 + 395 + cpu_buffer->bpages = bpages; 396 + 397 + cpu_buffer->meta = (void *)desc->meta_va; 398 + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); 399 + cpu_buffer->meta->meta_page_size = PAGE_SIZE; 400 + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; 401 + 402 + /* The reader page is not part of the ring initially */ 403 + simple_bpage_init(bpage, desc->page_va[0]); 404 + bpage->id = 0; 405 + 406 + cpu_buffer->nr_pages = 1; 407 + 408 + cpu_buffer->reader_page = bpage; 409 + cpu_buffer->tail_page = bpage + 1; 410 + cpu_buffer->head_page = bpage + 1; 411 + 412 + for (i = 1; i < desc->nr_page_va; i++) { 413 + simple_bpage_init(++bpage, desc->page_va[i]); 414 + 415 + bpage->link.next = &(bpage + 1)->link; 416 + bpage->link.prev = &(bpage - 1)->link; 417 + bpage->id = i; 418 + 419 + cpu_buffer->nr_pages = i + 1; 420 + } 421 + 422 + /* Close the ring */ 423 + bpage->link.next = &cpu_buffer->tail_page->link; 424 + cpu_buffer->tail_page->link.prev = &bpage->link; 425 + 426 + /* The last init'ed page points to the head page */ 427 + 
simple_bpage_set_head_link(bpage); 428 + 429 + return 0; 430 + } 431 + EXPORT_SYMBOL_GPL(simple_ring_buffer_init); 432 + 433 + /** 434 + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion 435 + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. 436 + */ 437 + void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) 438 + { 439 + if (!simple_rb_loaded(cpu_buffer)) 440 + return; 441 + 442 + simple_rb_enable_tracing(cpu_buffer, false); 443 + 444 + cpu_buffer->bpages = NULL; 445 + } 446 + EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); 447 + 448 + /** 449 + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer 450 + * @cpu_buffer: A simple_rb_per_cpu 451 + * @enable: True to enable tracing, False to disable it 452 + * 453 + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded 454 + */ 455 + int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) 456 + { 457 + if (!simple_rb_loaded(cpu_buffer)) 458 + return -ENODEV; 459 + 460 + simple_rb_enable_tracing(cpu_buffer, enable); 461 + 462 + return 0; 463 + } 464 + EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing);