profiling: attempt to remove per-cpu profile flip buffer

This is the really old legacy kernel profiling code, which has long
since been obviated by "real profiling" (ie 'prof' and company), and
mainly remains as a source of syzbot reports.

There are anecdotal reports that people still use it for boot-time
profiling, but it's unlikely that such use would care about the old NUMA
optimizations in this code from 2004 (commit ad02973d42: "profile: 512x
Altix timer interrupt livelock fix" in the BK import archive at [1]).

So in order to head off future syzbot reports, let's try to simplify
this code and get rid of the per-cpu profile buffers that are quite a
large portion of the complexity footprint of this thing (including CPU
hotplug callbacks etc).

It's unlikely anybody will actually notice, or possibly, as Thomas put
it: "Only people who indulge in nostalgia will notice :)".

That said, if it turns out that this code is actually actively used by
somebody, we can always revert this removal. Thus the "attempt" in the
summary line.

[ Note: in a small nod to "the profiling code can cause NUMA problems",
this also removes the "increment the last entry in the profiling array
on any unknown hits" logic. That would account any program counter in
a module to that single counter location, and might exacerbate any
NUMA cacheline bouncing issues ]

Link: https://lore.kernel.org/all/CAHk-=wgs52BxT4Zjmjz8aNvHWKxf5_ThBY4bYL1Y6CTaNL2dTw@mail.gmail.com/
Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1]
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Linus Torvalds 2 years ago 2accfdb7 7c51f7bb

+2 -182

2 changed files

expand all

include

linux

cpuhotplug.h

kernel

profile.c

-1

include/linux/cpuhotplug.h

@@ -100,7 +100,6 @@
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,
-	CPUHP_PROFILE_PREPARE,
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
 	CPUHP_RELAY_PREPARE,

+2 -181

kernel/profile.c

@@ -129,180 +129,13 @@
 	return -ENOMEM;
 }
 
-#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
-/*
- * Each cpu has a pair of open-addressed hashtables for pending
- * profile hits. read_profile() IPI's all cpus to request them
- * to flip buffers and flushes their contents to prof_buffer itself.
- * Flip requests are serialized by the profile_flip_mutex. The sole
- * use of having a second hashtable is for avoiding cacheline
- * contention that would otherwise happen during flushes of pending
- * profile hits required for the accuracy of reported profile hits
- * and so resurrect the interrupt livelock issue.
- *
- * The open-addressed hashtables are indexed by profile buffer slot
- * and hold the number of pending hits to that profile buffer slot on
- * a cpu in an entry. When the hashtable overflows, all pending hits
- * are accounted to their corresponding profile buffer slots with
- * atomic_add() and the hashtable emptied. As numerous pending hits
- * may be accounted to a profile buffer slot in a hashtable entry,
- * this amortizes a number of atomic profile buffer increments likely
- * to be far larger than the number of entries in the hashtable,
- * particularly given that the number of distinct profile buffer
- * positions to which hits are accounted during short intervals (e.g.
- * several seconds) is usually very small. Exclusion from buffer
- * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
- * process context).
- * The hash function is meant to be lightweight as opposed to strong,
- * and was vaguely inspired by ppc64 firmware-supported inverted
- * pagetable hash functions, but uses a full hashtable full of finite
- * collision chains, not just pairs of them.
- *
- * -- nyc
- */
-static void __profile_flip_buffers(void *unused)
-{
-	int cpu = smp_processor_id();
-
-	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
-}
-
-static void profile_flip_buffers(void)
-{
-	int i, j, cpu;
-
-	mutex_lock(&profile_flip_mutex);
-	j = per_cpu(cpu_profile_flip, get_cpu());
-	put_cpu();
-	on_each_cpu(__profile_flip_buffers, NULL, 1);
-	for_each_online_cpu(cpu) {
-		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
-		for (i = 0; i < NR_PROFILE_HIT; ++i) {
-			if (!hits[i].hits) {
-				if (hits[i].pc)
-					hits[i].pc = 0;
-				continue;
-			}
-			atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
-			hits[i].hits = hits[i].pc = 0;
-		}
-	}
-	mutex_unlock(&profile_flip_mutex);
-}
-
-static void profile_discard_flip_buffers(void)
-{
-	int i, cpu;
-
-	mutex_lock(&profile_flip_mutex);
-	i = per_cpu(cpu_profile_flip, get_cpu());
-	put_cpu();
-	on_each_cpu(__profile_flip_buffers, NULL, 1);
-	for_each_online_cpu(cpu) {
-		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
-		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
-	}
-	mutex_unlock(&profile_flip_mutex);
-}
-
-static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
-{
-	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
-	int i, j, cpu;
-	struct profile_hit *hits;
-
-	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
-	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
-	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
-	cpu = get_cpu();
-	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
-	if (!hits) {
-		put_cpu();
-		return;
-	}
-	/*
-	 * We buffer the global profiler buffer into a per-CPU
-	 * queue and thus reduce the number of global (and possibly
-	 * NUMA-alien) accesses. The write-queue is self-coalescing:
-	 */
-	local_irq_save(flags);
-	do {
-		for (j = 0; j < PROFILE_GRPSZ; ++j) {
-			if (hits[i + j].pc == pc) {
-				hits[i + j].hits += nr_hits;
-				goto out;
-			} else if (!hits[i + j].hits) {
-				hits[i + j].pc = pc;
-				hits[i + j].hits = nr_hits;
-				goto out;
-			}
-		}
-		i = (i + secondary) & (NR_PROFILE_HIT - 1);
-	} while (i != primary);
-
-	/*
-	 * Add the current hit(s) and flush the write-queue out
-	 * to the global buffer:
-	 */
-	atomic_add(nr_hits, &prof_buffer[pc]);
-	for (i = 0; i < NR_PROFILE_HIT; ++i) {
-		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
-		hits[i].pc = hits[i].hits = 0;
-	}
-out:
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-static int profile_dead_cpu(unsigned int cpu)
-{
-	struct page *page;
-	int i;
-
-	for (i = 0; i < 2; i++) {
-		if (per_cpu(cpu_profile_hits, cpu)[i]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
-			per_cpu(cpu_profile_hits, cpu)[i] = NULL;
-			__free_page(page);
-		}
-	}
-	return 0;
-}
-
-static int profile_prepare_cpu(unsigned int cpu)
-{
-	int i, node = cpu_to_mem(cpu);
-	struct page *page;
-
-	per_cpu(cpu_profile_flip, cpu) = 0;
-
-	for (i = 0; i < 2; i++) {
-		if (per_cpu(cpu_profile_hits, cpu)[i])
-			continue;
-
-		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
-		if (!page) {
-			profile_dead_cpu(cpu);
-			return -ENOMEM;
-		}
-		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
-
-	}
-	return 0;
-}
-
-#else /* !CONFIG_SMP */
-#define profile_flip_buffers()	do { } while (0)
-#define profile_discard_flip_buffers()	do { } while (0)
-
 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
 	unsigned long pc;
 	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
-	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
+	if (pc < prof_len)
+		atomic_add(nr_hits, &prof_buffer[pc]);
 }
-#endif /* !CONFIG_SMP */
 
 void profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
@@ -340,7 +173,6 @@
 	char *pnt;
 	unsigned long sample_step = 1UL << prof_shift;
 
-	profile_flip_buffers();
 	if (p >= (prof_len+1)*sizeof(unsigned int))
 		return 0;
 	if (count > (prof_len+1)*sizeof(unsigned int) - p)
@@ -386,7 +218,6 @@
 			return -EINVAL;
 	}
 #endif
-	profile_discard_flip_buffers();
 	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
 	return count;
 }
@@ -404,20 +235,10 @@
 
 	if (!prof_on)
 		return 0;
-#ifdef CONFIG_SMP
-	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
-				profile_prepare_cpu, profile_dead_cpu);
-	if (err)
-		return err;
-#endif
 	entry = proc_create("profile", S_IWUSR | S_IRUGO,
 			    NULL, &profile_proc_ops);
 	if (entry)
 		proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
-#ifdef CONFIG_SMP
-	else
-		cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
-#endif
 	return err;
 }
 subsys_initcall(create_proc_profile);

Configure Feed

Configure Feed