drivers/cpuidle/cpuidle-pseries.c at master

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / cpuidle / cpuidle-pseries.c
at master 481 lines 13 kB view raw
wrap content
  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 *  cpuidle-pseries - idle state cpuidle driver.
  4 *  Adapted from drivers/idle/intel_idle.c and
  5 *  drivers/acpi/processor_idle.c
  6 *
  7 */
  8
  9#include <linux/kernel.h>
 10#include <linux/module.h>
 11#include <linux/init.h>
 12#include <linux/moduleparam.h>
 13#include <linux/cpuidle.h>
 14#include <linux/cpu.h>
 15#include <linux/notifier.h>
 16
 17#include <asm/paca.h>
 18#include <asm/reg.h>
 19#include <asm/machdep.h>
 20#include <asm/firmware.h>
 21#include <asm/runlatch.h>
 22#include <asm/idle.h>
 23#include <asm/plpar_wrappers.h>
 24#include <asm/rtas.h>
 25#include <asm/time.h>
 26
 27static struct cpuidle_driver pseries_idle_driver = {
 28	.name             = "pseries_idle",
 29	.owner            = THIS_MODULE,
 30};
 31
 32static int max_idle_state __read_mostly;
 33static struct cpuidle_state *cpuidle_state_table __read_mostly;
 34static u64 snooze_timeout __read_mostly;
 35static bool snooze_timeout_en __read_mostly;
 36
 37static __cpuidle
 38int snooze_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 39		int index)
 40{
 41	u64 snooze_exit_time;
 42
 43	set_thread_flag(TIF_POLLING_NRFLAG);
 44
 45	pseries_idle_prolog();
 46	raw_local_irq_enable();
 47	snooze_exit_time = get_tb() + snooze_timeout;
 48	dev->poll_time_limit = false;
 49
 50	while (!need_resched()) {
 51		HMT_low();
 52		HMT_very_low();
 53		if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
 54			/*
 55			 * Task has not woken up but we are exiting the polling
 56			 * loop anyway. Require a barrier after polling is
 57			 * cleared to order subsequent test of need_resched().
 58			 */
 59			dev->poll_time_limit = true;
 60			clear_thread_flag(TIF_POLLING_NRFLAG);
 61			smp_mb();
 62			break;
 63		}
 64	}
 65
 66	HMT_medium();
 67
 68       /* Avoid double clear when breaking */
 69	if (!dev->poll_time_limit)
 70		clear_thread_flag(TIF_POLLING_NRFLAG);
 71
 72	raw_local_irq_disable();
 73
 74	pseries_idle_epilog();
 75
 76	return index;
 77}
 78
 79static __cpuidle void check_and_cede_processor(void)
 80{
 81	/*
 82	 * Ensure our interrupt state is properly tracked,
 83	 * also checks if no interrupt has occurred while we
 84	 * were soft-disabled
 85	 */
 86	if (prep_irq_for_idle()) {
 87		cede_processor();
 88#ifdef CONFIG_TRACE_IRQFLAGS
 89		/* Ensure that H_CEDE returns with IRQs on */
 90		if (WARN_ON(!(mfmsr() & MSR_EE)))
 91			__hard_irq_enable();
 92#endif
 93	}
 94}
 95
 96/*
 * XCEDE: Extended CEDE states discovered through the
 *        "ibm,get-system-parameter" RTAS call with the token
 *        CEDE_LATENCY_TOKEN
100 */
101
/*
 * Token passed to ibm,get-system-parameter to request the Cede Latency
 * Settings Information. The full list of parameters is tabulated in
 * Section 7.3.16 (System Parameters Option) of PAPR version 2.8.1.
 */
#define CEDE_LATENCY_TOKEN	45
109
110/*
111 * If the platform supports the cede latency settings information system
112 * parameter it must provide the following information in the NULL terminated
113 * parameter string:
114 *
115 * a. The first byte is the length “N” of each cede latency setting record minus
116 *    one (zero indicates a length of 1 byte).
117 *
118 * b. For each supported cede latency setting a cede latency setting record
119 *    consisting of the first “N” bytes as per the following table.
120 *
121 *    -----------------------------
122 *    | Field           | Field   |
123 *    | Name            | Length  |
124 *    -----------------------------
125 *    | Cede Latency    | 1 Byte  |
126 *    | Specifier Value |         |
127 *    -----------------------------
128 *    | Maximum wakeup  |         |
129 *    | latency in      | 8 Bytes |
130 *    | tb-ticks        |         |
131 *    -----------------------------
132 *    | Responsive to   |         |
133 *    | external        | 1 Byte  |
134 *    | interrupts      |         |
135 *    -----------------------------
136 *
137 * This version has cede latency record size = 10.
138 *
139 * The structure xcede_latency_payload represents a) and b) with
140 * xcede_latency_record representing the table in b).
141 *
 * xcede_latency_parameter is what gets returned by the
 * ibm,get-system-parameter RTAS call when made with
 * CEDE_LATENCY_TOKEN.
145 *
146 * These structures are only used to represent the data obtained by the RTAS
147 * call. The data is in big-endian.
148 */
149struct xcede_latency_record {
150	u8	hint;
151	__be64	latency_ticks;
152	u8	wake_on_irqs;
153} __packed;
154
155// Make space for 16 records, which "should be enough".
156struct xcede_latency_payload {
157	u8     record_size;
158	struct xcede_latency_record records[16];
159} __packed;
160
161struct xcede_latency_parameter {
162	__be16  payload_size;
163	struct xcede_latency_payload payload;
164	u8 null_char;
165} __packed;
166
167static unsigned int nr_xcede_records;
168static struct xcede_latency_parameter xcede_latency_parameter __initdata;
169
170static int __init parse_cede_parameters(void)
171{
172	struct xcede_latency_payload *payload;
173	u32 total_xcede_records_size;
174	u8 xcede_record_size;
175	u16 payload_size;
176	int ret, i;
177
178	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
179			NULL, CEDE_LATENCY_TOKEN, __pa(&xcede_latency_parameter),
180			sizeof(xcede_latency_parameter));
181	if (ret) {
182		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
183		return ret;
184	}
185
186	payload_size = be16_to_cpu(xcede_latency_parameter.payload_size);
187	payload = &xcede_latency_parameter.payload;
188
189	xcede_record_size = payload->record_size + 1;
190
191	if (xcede_record_size != sizeof(struct xcede_latency_record)) {
192		pr_err("xcede: Expected record-size %lu. Observed size %u.\n",
193		       sizeof(struct xcede_latency_record), xcede_record_size);
194		return -EINVAL;
195	}
196
197	pr_info("xcede: xcede_record_size = %d\n", xcede_record_size);
198
199	/*
200	 * Since the payload_size includes the last NULL byte and the
201	 * xcede_record_size, the remaining bytes correspond to array of all
202	 * cede_latency settings.
203	 */
204	total_xcede_records_size = payload_size - 2;
205	nr_xcede_records = total_xcede_records_size / xcede_record_size;
206
207	for (i = 0; i < nr_xcede_records; i++) {
208		struct xcede_latency_record *record = &payload->records[i];
209		u64 latency_ticks = be64_to_cpu(record->latency_ticks);
210		u8 wake_on_irqs = record->wake_on_irqs;
211		u8 hint = record->hint;
212
213		pr_info("xcede: Record %d : hint = %u, latency = 0x%llx tb ticks, Wake-on-irq = %u\n",
214			i, hint, latency_ticks, wake_on_irqs);
215	}
216
217	return 0;
218}
219
220#define NR_DEDICATED_STATES	2 /* snooze, CEDE */
221static u8 cede_latency_hint[NR_DEDICATED_STATES];
222
223static __cpuidle
224int dedicated_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
225			int index)
226{
227	u8 old_latency_hint;
228
229	pseries_idle_prolog();
230	get_lppaca()->donate_dedicated_cpu = 1;
231	old_latency_hint = get_lppaca()->cede_latency_hint;
232	get_lppaca()->cede_latency_hint = cede_latency_hint[index];
233
234	HMT_medium();
235	check_and_cede_processor();
236
237	raw_local_irq_disable();
238	get_lppaca()->donate_dedicated_cpu = 0;
239	get_lppaca()->cede_latency_hint = old_latency_hint;
240
241	pseries_idle_epilog();
242
243	return index;
244}
245
246static __cpuidle
247int shared_cede_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv,
248		     int index)
249{
250
251	pseries_idle_prolog();
252
253	/*
254	 * Yield the processor to the hypervisor.  We return if
255	 * an external interrupt occurs (which are driven prior
256	 * to returning here) or if a prod occurs from another
257	 * processor. When returning here, external interrupts
258	 * are enabled.
259	 */
260	check_and_cede_processor();
261
262	raw_local_irq_disable();
263	pseries_idle_epilog();
264
265	return index;
266}
267
268/*
269 * States for dedicated partition case.
270 */
271static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
272	{ /* Snooze */
273		.name = "snooze",
274		.desc = "snooze",
275		.exit_latency = 0,
276		.target_residency = 0,
277		.enter = &snooze_loop,
278		.flags = CPUIDLE_FLAG_POLLING },
279	{ /* CEDE */
280		.name = "CEDE",
281		.desc = "CEDE",
282		.exit_latency = 10,
283		.target_residency = 100,
284		.enter = &dedicated_cede_loop },
285};
286
287/*
288 * States for shared partition case.
289 */
290static struct cpuidle_state shared_states[] = {
291	{ /* Snooze */
292		.name = "snooze",
293		.desc = "snooze",
294		.exit_latency = 0,
295		.target_residency = 0,
296		.enter = &snooze_loop,
297		.flags = CPUIDLE_FLAG_POLLING },
298	{ /* Shared Cede */
299		.name = "Shared Cede",
300		.desc = "Shared Cede",
301		.exit_latency = 10,
302		.target_residency = 100,
303		.enter = &shared_cede_loop },
304};
305
306static int pseries_cpuidle_cpu_online(unsigned int cpu)
307{
308	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
309
310	if (dev && cpuidle_get_driver()) {
311		cpuidle_pause_and_lock();
312		cpuidle_enable_device(dev);
313		cpuidle_resume_and_unlock();
314	}
315	return 0;
316}
317
318static int pseries_cpuidle_cpu_dead(unsigned int cpu)
319{
320	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
321
322	if (dev && cpuidle_get_driver()) {
323		cpuidle_pause_and_lock();
324		cpuidle_disable_device(dev);
325		cpuidle_resume_and_unlock();
326	}
327	return 0;
328}
329
330/*
331 * pseries_cpuidle_driver_init()
332 */
333static int pseries_cpuidle_driver_init(void)
334{
335	int idle_state;
336	struct cpuidle_driver *drv = &pseries_idle_driver;
337
338	drv->state_count = 0;
339
340	for (idle_state = 0; idle_state < max_idle_state; ++idle_state) {
341		/* Is the state not enabled? */
342		if (cpuidle_state_table[idle_state].enter == NULL)
343			continue;
344
345		drv->states[drv->state_count] =	/* structure copy */
346			cpuidle_state_table[idle_state];
347
348		drv->state_count += 1;
349	}
350
351	return 0;
352}
353
354static void __init fixup_cede0_latency(void)
355{
356	struct xcede_latency_payload *payload;
357	u64 min_xcede_latency_us = UINT_MAX;
358	int i;
359
360	if (parse_cede_parameters())
361		return;
362
363	pr_info("cpuidle: Skipping the %d Extended CEDE idle states\n",
364		nr_xcede_records);
365
366	payload = &xcede_latency_parameter.payload;
367
368	/*
369	 * The CEDE idle state maps to CEDE(0). While the hypervisor
370	 * does not advertise CEDE(0) exit latency values, it does
371	 * advertise the latency values of the extended CEDE states.
372	 * We use the lowest advertised exit latency value as a proxy
373	 * for the exit latency of CEDE(0).
374	 */
375	for (i = 0; i < nr_xcede_records; i++) {
376		struct xcede_latency_record *record = &payload->records[i];
377		u8 hint = record->hint;
378		u64 latency_tb = be64_to_cpu(record->latency_ticks);
379		u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
380
381		/*
382		 * We expect the exit latency of an extended CEDE
383		 * state to be non-zero, it to since it takes at least
384		 * a few nanoseconds to wakeup the idle CPU and
385		 * dispatch the virtual processor into the Linux
386		 * Guest.
387		 *
388		 * So we consider only non-zero value for performing
389		 * the fixup of CEDE(0) latency.
390		 */
391		if (latency_us == 0) {
392			pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n",
393				i, hint);
394			continue;
395		}
396
397		if (latency_us < min_xcede_latency_us)
398			min_xcede_latency_us = latency_us;
399	}
400
401	if (min_xcede_latency_us != UINT_MAX) {
402		dedicated_states[1].exit_latency = min_xcede_latency_us;
403		dedicated_states[1].target_residency = 10 * (min_xcede_latency_us);
404		pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n",
405			min_xcede_latency_us);
406	}
407
408}
409
410/*
411 * pseries_idle_probe()
412 * Choose state table for shared versus dedicated partition
413 */
414static int __init pseries_idle_probe(void)
415{
416
417	if (cpuidle_disable != IDLE_NO_OVERRIDE)
418		return -ENODEV;
419
420	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
421		if (lppaca_shared_proc()) {
422			cpuidle_state_table = shared_states;
423			max_idle_state = ARRAY_SIZE(shared_states);
424		} else {
425			/*
426			 * Use firmware provided latency values
427			 * starting with POWER10 platforms. In the
428			 * case that we are running on a POWER10
429			 * platform but in an earlier compat mode, we
430			 * can still use the firmware provided values.
431			 *
432			 * However, on platforms prior to POWER10, we
433			 * cannot rely on the accuracy of the firmware
434			 * provided latency values. On such platforms,
435			 * go with the conservative default estimate
436			 * of 10us.
437			 */
438			if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10))
439				fixup_cede0_latency();
440			cpuidle_state_table = dedicated_states;
441			max_idle_state = NR_DEDICATED_STATES;
442		}
443	} else
444		return -ENODEV;
445
446	if (max_idle_state > 1) {
447		snooze_timeout_en = true;
448		snooze_timeout = cpuidle_state_table[1].target_residency *
449				 tb_ticks_per_usec;
450	}
451	return 0;
452}
453
454static int __init pseries_processor_idle_init(void)
455{
456	int retval;
457
458	retval = pseries_idle_probe();
459	if (retval)
460		return retval;
461
462	pseries_cpuidle_driver_init();
463	retval = cpuidle_register(&pseries_idle_driver, NULL);
464	if (retval) {
465		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
466		return retval;
467	}
468
469	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
470					   "cpuidle/pseries:online",
471					   pseries_cpuidle_cpu_online, NULL);
472	WARN_ON(retval < 0);
473	retval = cpuhp_setup_state_nocalls(CPUHP_CPUIDLE_DEAD,
474					   "cpuidle/pseries:DEAD", NULL,
475					   pseries_cpuidle_cpu_dead);
476	WARN_ON(retval < 0);
477	printk(KERN_DEBUG "pseries_idle_driver registered\n");
478	return 0;
479}
480
481device_initcall(pseries_processor_idle_init);
Configure Feed

Configure Feed