From fa86ee90eb1111267de67cb4272b5ce711f18cbb Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:25 -0300 Subject: add cpuidle-haltpoll driver Add a cpuidle driver that calls the architecture default_idle routine. To be used in conjunction with the haltpoll governor. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/Kconfig | 9 +++++ drivers/cpuidle/Makefile | 1 + drivers/cpuidle/cpuidle-haltpoll.c | 68 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+) create mode 100644 drivers/cpuidle/cpuidle-haltpoll.c (limited to 'drivers') diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index a4ac31e4a58c..cc8efc56be7d 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -51,6 +51,15 @@ depends on PPC source "drivers/cpuidle/Kconfig.powerpc" endmenu +config HALTPOLL_CPUIDLE + tristate "Halt poll cpuidle driver" + depends on X86 && KVM_GUEST + default y + help + This option enables halt poll cpuidle driver, which allows to poll + before halting in the guest (more efficient than polling in the + host via halt_poll_ns for some scenarios). 
+ endif config ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile index 9d7176cee3d3..240227474cd9 100644 --- a/drivers/cpuidle/Makefile +++ b/drivers/cpuidle/Makefile @@ -7,6 +7,7 @@ obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o obj-$(CONFIG_ARCH_HAS_CPU_RELAX) += poll_state.o +obj-$(CONFIG_HALTPOLL_CPUIDLE) += cpuidle-haltpoll.o ################################################################################## # ARM SoC drivers diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c new file mode 100644 index 000000000000..35cfb53e9287 --- /dev/null +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cpuidle driver for haltpoll governor. + * + * Copyright 2019 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + * Authors: Marcelo Tosatti + */ + +#include +#include +#include +#include +#include + +static int default_enter_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + if (current_clr_polling_and_test()) { + local_irq_enable(); + return index; + } + default_idle(); + return index; +} + +static struct cpuidle_driver haltpoll_driver = { + .name = "haltpoll", + .owner = THIS_MODULE, + .states = { + { /* entry 0 is for polling */ }, + { + .enter = default_enter_idle, + .exit_latency = 1, + .target_residency = 1, + .power_usage = -1, + .name = "haltpoll idle", + .desc = "default architecture idle", + }, + }, + .safe_state_index = 0, + .state_count = 2, +}; + +static int __init haltpoll_init(void) +{ + struct cpuidle_driver *drv = &haltpoll_driver; + + cpuidle_poll_state_init(drv); + + if (!kvm_para_available()) + return 0; + + return cpuidle_register(&haltpoll_driver, NULL); +} + +static void __exit haltpoll_exit(void) +{ + cpuidle_unregister(&haltpoll_driver); +} + +module_init(haltpoll_init); +module_exit(haltpoll_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marcelo Tosatti "); -- cgit v1.2.3 From 259231a045616c4101d023a8f4dcc8379af265a6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:26 -0300 Subject: cpuidle: add poll_limit_ns to cpuidle_device structure Add a poll_limit_ns variable to cpuidle_device structure. Calculate and configure it in the new cpuidle_poll_time function, in case it's zero. Individual governors are allowed to override this value. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.c | 30 ++++++++++++++++++++++++++++++ drivers/cpuidle/poll_state.c | 11 ++--------- drivers/cpuidle/sysfs.c | 7 +++++++ 3 files changed, 39 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 0f4b7c45df3e..0895b988fa92 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -361,6 +361,36 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index) cpuidle_curr_governor->reflect(dev, index); } +/** + * cpuidle_poll_time - return amount of time to poll for, + * governors can override dev->poll_limit_ns if necessary + * + * @drv: the cpuidle driver tied with the cpu + * @dev: the cpuidle device + * + */ +u64 cpuidle_poll_time(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + int i; + u64 limit_ns; + + if (dev->poll_limit_ns) + return dev->poll_limit_ns; + + limit_ns = TICK_NSEC; + for (i = 1; i < drv->state_count; i++) { + if (drv->states[i].disabled || dev->states_usage[i].disable) + continue; + + limit_ns = (u64)drv->states[i].target_residency * NSEC_PER_USEC; + } + + dev->poll_limit_ns = limit_ns; + + return dev->poll_limit_ns; +} + /** * cpuidle_install_idle_handler - installs the cpuidle idle loop handler */ diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 02b9315a9e96..c8fa5f41dfc4 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -20,16 +20,9 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, local_irq_enable(); if (!current_set_polling_and_test()) { unsigned int loop_count = 0; - u64 limit = TICK_NSEC; - int i; + u64 limit; - for (i = 1; i < drv->state_count; i++) { - if (drv->states[i].disabled || dev->states_usage[i].disable) - continue; - - limit = (u64)drv->states[i].target_residency * NSEC_PER_USEC; - break; - } + limit = cpuidle_poll_time(drv, dev); while (!need_resched()) { cpu_relax(); diff --git a/drivers/cpuidle/sysfs.c 
b/drivers/cpuidle/sysfs.c index eb20adb5de23..2bb2683b493c 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -334,6 +334,7 @@ struct cpuidle_state_kobj { struct cpuidle_state_usage *state_usage; struct completion kobj_unregister; struct kobject kobj; + struct cpuidle_device *device; }; #ifdef CONFIG_SUSPEND @@ -391,6 +392,7 @@ static inline void cpuidle_remove_s2idle_attr_group(struct cpuidle_state_kobj *k #define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj) #define kobj_to_state(k) (kobj_to_state_obj(k)->state) #define kobj_to_state_usage(k) (kobj_to_state_obj(k)->state_usage) +#define kobj_to_device(k) (kobj_to_state_obj(k)->device) #define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr) static ssize_t cpuidle_state_show(struct kobject *kobj, struct attribute *attr, @@ -414,10 +416,14 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state *state = kobj_to_state(kobj); struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj); struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); + struct cpuidle_device *dev = kobj_to_device(kobj); if (cattr->store) ret = cattr->store(state, state_usage, buf, size); + /* reset poll time cache */ + dev->poll_limit_ns = 0; + return ret; } @@ -468,6 +474,7 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; + kobj->device = device; init_completion(&kobj->kobj_unregister); ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle, -- cgit v1.2.3 From 7d4daeedd575bbc3c40c87fc6708a8b88c50fe7e Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:27 -0300 Subject: governors: unify last_state_idx Since this field is shared by all governors, move it to cpuidle device structure. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/ladder.c | 21 ++++++++++----------- drivers/cpuidle/governors/menu.c | 5 ++--- drivers/cpuidle/governors/teo.c | 12 ++++++------ 3 files changed, 18 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index f0dddc66af26..428eeb832fe7 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -38,7 +38,6 @@ struct ladder_device_state { struct ladder_device { struct ladder_device_state states[CPUIDLE_STATE_MAX]; - int last_state_idx; }; static DEFINE_PER_CPU(struct ladder_device, ladder_devices); @@ -49,12 +48,13 @@ static DEFINE_PER_CPU(struct ladder_device, ladder_devices); * @old_idx: the current state index * @new_idx: the new target state index */ -static inline void ladder_do_selection(struct ladder_device *ldev, +static inline void ladder_do_selection(struct cpuidle_device *dev, + struct ladder_device *ldev, int old_idx, int new_idx) { ldev->states[old_idx].stats.promotion_count = 0; ldev->states[old_idx].stats.demotion_count = 0; - ldev->last_state_idx = new_idx; + dev->last_state_idx = new_idx; } /** @@ -68,13 +68,13 @@ static int ladder_select_state(struct cpuidle_driver *drv, { struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; - int last_residency, last_idx = ldev->last_state_idx; + int last_residency, last_idx = dev->last_state_idx; int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 
1 : 0; int latency_req = cpuidle_governor_latency_req(dev->cpu); /* Special case when user has set very strict latency requirement */ if (unlikely(latency_req == 0)) { - ladder_do_selection(ldev, last_idx, 0); + ladder_do_selection(dev, ldev, last_idx, 0); return 0; } @@ -91,7 +91,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state->stats.promotion_count++; last_state->stats.demotion_count = 0; if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) { - ladder_do_selection(ldev, last_idx, last_idx + 1); + ladder_do_selection(dev, ldev, last_idx, last_idx + 1); return last_idx + 1; } } @@ -107,7 +107,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, if (drv->states[i].exit_latency <= latency_req) break; } - ladder_do_selection(ldev, last_idx, i); + ladder_do_selection(dev, ldev, last_idx, i); return i; } @@ -116,7 +116,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, last_state->stats.demotion_count++; last_state->stats.promotion_count = 0; if (last_state->stats.demotion_count >= last_state->threshold.demotion_count) { - ladder_do_selection(ldev, last_idx, last_idx - 1); + ladder_do_selection(dev, ldev, last_idx, last_idx - 1); return last_idx - 1; } } @@ -139,7 +139,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv, struct ladder_device_state *lstate; struct cpuidle_state *state; - ldev->last_state_idx = first_idx; + dev->last_state_idx = first_idx; for (i = first_idx; i < drv->state_count; i++) { state = &drv->states[i]; @@ -167,9 +167,8 @@ static int ladder_enable_device(struct cpuidle_driver *drv, */ static void ladder_reflect(struct cpuidle_device *dev, int index) { - struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); if (index > 0) - ldev->last_state_idx = index; + dev->last_state_idx = index; } static struct cpuidle_governor ladder_governor = { diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index e9a28c7846d6..dace4c7f830c 
100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -117,7 +117,6 @@ */ struct menu_device { - int last_state_idx; int needs_update; int tick_wakeup; @@ -455,7 +454,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index) { struct menu_device *data = this_cpu_ptr(&menu_devices); - data->last_state_idx = index; + dev->last_state_idx = index; data->needs_update = 1; data->tick_wakeup = tick_nohz_idle_got_tick(); } @@ -468,7 +467,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index) static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct menu_device *data = this_cpu_ptr(&menu_devices); - int last_idx = data->last_state_idx; + int last_idx = dev->last_state_idx; struct cpuidle_state *target = &drv->states[last_idx]; unsigned int measured_us; unsigned int new_factor; diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 7d05efdbd3c6..a2fd81067a13 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -96,7 +96,6 @@ struct teo_idle_state { * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @states: Idle states data corresponding to this CPU. - * @last_state: Idle state entered by the CPU last time. * @interval_idx: Index of the most recent saved idle interval. * @intervals: Saved idle duration values. 
*/ @@ -104,7 +103,6 @@ struct teo_cpu { u64 time_span_ns; u64 sleep_length_ns; struct teo_idle_state states[CPUIDLE_STATE_MAX]; - int last_state; int interval_idx; unsigned int intervals[INTERVALS]; }; @@ -130,7 +128,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) */ measured_us = sleep_length_us; } else { - unsigned int lat = drv->states[cpu_data->last_state].exit_latency; + unsigned int lat; + + lat = drv->states[dev->last_state_idx].exit_latency; measured_us = ktime_to_us(cpu_data->time_span_ns); /* @@ -245,9 +245,9 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, int max_early_idx, idx, i; ktime_t delta_tick; - if (cpu_data->last_state >= 0) { + if (dev->last_state_idx >= 0) { teo_update(drv, dev); - cpu_data->last_state = -1; + dev->last_state_idx = -1; } cpu_data->time_span_ns = local_clock(); @@ -394,7 +394,7 @@ static void teo_reflect(struct cpuidle_device *dev, int state) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - cpu_data->last_state = state; + dev->last_state_idx = state; /* * If the wakeup was not "natural", but triggered by one of the safety * nets, assume that the CPU might have been idle for the entire sleep -- cgit v1.2.3 From 2cffe9f6b96fece065ee8522673c90e92ef2085d Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:28 -0300 Subject: cpuidle: add haltpoll governor The cpuidle_haltpoll governor, in conjunction with the haltpoll cpuidle driver, allows guest vcpus to poll for a specified amount of time before halting. This provides the following benefits to host side polling: 1) The POLL flag is set while polling is performed, which allows a remote vCPU to avoid sending an IPI (and the associated cost of handling the IPI) when performing a wakeup. 2) The VM-exit cost can be avoided. The downside of guest side polling is that polling is performed even with other runnable tasks in the host. 
Results comparing halt_poll_ns and server/client application where a small packet is ping-ponged: host --> 31.33 halt_poll_ns=300000 / no guest busy spin --> 33.40 (93.8%) halt_poll_ns=0 / guest_halt_poll_ns=300000 --> 32.73 (95.7%) For the SAP HANA benchmarks (where idle_spin is a parameter of the previous version of the patch, results should be the same): hpns == halt_poll_ns idle_spin=0/ idle_spin=800/ idle_spin=0/ hpns=200000 hpns=0 hpns=800000 DeleteC06T03 (100 thread) 1.76 1.71 (-3%) 1.78 (+1%) InsertC16T02 (100 thread) 2.14 2.07 (-3%) 2.18 (+1.8%) DeleteC00T01 (1 thread) 1.34 1.28 (-4.5%) 1.29 (-3.7%) UpdateC00T03 (1 thread) 4.72 4.18 (-12%) 4.53 (-5%) Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/Kconfig | 11 +++ drivers/cpuidle/governors/Makefile | 1 + drivers/cpuidle/governors/haltpoll.c | 150 +++++++++++++++++++++++++++++++++++ 3 files changed, 162 insertions(+) create mode 100644 drivers/cpuidle/governors/haltpoll.c (limited to 'drivers') diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index cc8efc56be7d..88727b7c0d59 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -33,6 +33,17 @@ config CPU_IDLE_GOV_TEO Some workloads benefit from using it and it generally should be safe to use. Say Y here if you are not happy with the alternatives. +config CPU_IDLE_GOV_HALTPOLL + bool "Haltpoll governor (for virtualized systems)" + depends on KVM_GUEST + help + This governor implements haltpoll idle state selection, to be + used in conjunction with the haltpoll cpuidle driver, allowing + for polling for a certain amount of time before entering idle + state. + + Some virtualized workloads benefit from using it. 
+ config DT_IDLE_STATES bool diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile index 42f44cc610dd..63abb5393a4d 100644 --- a/drivers/cpuidle/governors/Makefile +++ b/drivers/cpuidle/governors/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o +obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c new file mode 100644 index 000000000000..797477bda486 --- /dev/null +++ b/drivers/cpuidle/governors/haltpoll.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * haltpoll.c - haltpoll idle governor + * + * Copyright 2019 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Authors: Marcelo Tosatti + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int guest_halt_poll_ns __read_mostly = 200000; +module_param(guest_halt_poll_ns, uint, 0644); + +/* division factor to shrink halt_poll_ns */ +static unsigned int guest_halt_poll_shrink __read_mostly = 2; +module_param(guest_halt_poll_shrink, uint, 0644); + +/* multiplication factor to grow per-cpu poll_limit_ns */ +static unsigned int guest_halt_poll_grow __read_mostly = 2; +module_param(guest_halt_poll_grow, uint, 0644); + +/* value in us to start growing per-cpu halt_poll_ns */ +static unsigned int guest_halt_poll_grow_start __read_mostly = 50000; +module_param(guest_halt_poll_grow_start, uint, 0644); + +/* allow shrinking guest halt poll */ +static bool guest_halt_poll_allow_shrink __read_mostly = true; +module_param(guest_halt_poll_allow_shrink, bool, 0644); + +/** + * haltpoll_select - selects the next idle state to enter + * @drv: cpuidle driver containing state data + * @dev: the CPU + * @stop_tick: indication on 
whether or not to stop the tick + */ +static int haltpoll_select(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + bool *stop_tick) +{ + int latency_req = cpuidle_governor_latency_req(dev->cpu); + + if (!drv->state_count || latency_req == 0) { + *stop_tick = false; + return 0; + } + + if (dev->poll_limit_ns == 0) + return 1; + + /* Last state was poll? */ + if (dev->last_state_idx == 0) { + /* Halt if no event occurred on poll window */ + if (dev->poll_time_limit == true) + return 1; + + *stop_tick = false; + /* Otherwise, poll again */ + return 0; + } + + *stop_tick = false; + /* Last state was halt: poll */ + return 0; +} + +static void adjust_poll_limit(struct cpuidle_device *dev, unsigned int block_us) +{ + unsigned int val; + u64 block_ns = block_us*NSEC_PER_USEC; + + /* Grow cpu_halt_poll_us if + * cpu_halt_poll_us < block_ns < guest_halt_poll_us + */ + if (block_ns > dev->poll_limit_ns && block_ns <= guest_halt_poll_ns) { + val = dev->poll_limit_ns * guest_halt_poll_grow; + + if (val < guest_halt_poll_grow_start) + val = guest_halt_poll_grow_start; + if (val > guest_halt_poll_ns) + val = guest_halt_poll_ns; + + dev->poll_limit_ns = val; + } else if (block_ns > guest_halt_poll_ns && + guest_halt_poll_allow_shrink) { + unsigned int shrink = guest_halt_poll_shrink; + + val = dev->poll_limit_ns; + if (shrink == 0) + val = 0; + else + val /= shrink; + dev->poll_limit_ns = val; + } +} + +/** + * haltpoll_reflect - update variables and update poll time + * @dev: the CPU + * @index: the index of actual entered state + */ +static void haltpoll_reflect(struct cpuidle_device *dev, int index) +{ + dev->last_state_idx = index; + + if (index != 0) + adjust_poll_limit(dev, dev->last_residency); +} + +/** + * haltpoll_enable_device - scans a CPU's states and does setup + * @drv: cpuidle driver + * @dev: the CPU + */ +static int haltpoll_enable_device(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + dev->poll_limit_ns = 0; + + return 0; +} + +static 
struct cpuidle_governor haltpoll_governor = { + .name = "haltpoll", + .rating = 21, + .enable = haltpoll_enable_device, + .select = haltpoll_select, + .reflect = haltpoll_reflect, +}; + +static int __init init_haltpoll(void) +{ + if (kvm_para_available()) + return cpuidle_register_governor(&haltpoll_governor); + + return 0; +} + +postcore_initcall(init_haltpoll); -- cgit v1.2.3 From a1c4423b02b2121108e3ea9580741e0f26309a48 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 3 Jul 2019 20:51:29 -0300 Subject: cpuidle-haltpoll: disable host side polling when kvm virtualized When performing guest side polling, it is not necessary to also perform host side polling. So disable host side polling, via the new MSR interface, when loading cpuidle-haltpoll driver. Signed-off-by: Marcelo Tosatti Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-haltpoll.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 35cfb53e9287..9ac093dcbb01 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -15,6 +15,7 @@ #include #include #include +#include static int default_enter_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) @@ -47,6 +48,7 @@ static struct cpuidle_driver haltpoll_driver = { static int __init haltpoll_init(void) { + int ret; struct cpuidle_driver *drv = &haltpoll_driver; cpuidle_poll_state_init(drv); @@ -54,11 +56,16 @@ static int __init haltpoll_init(void) if (!kvm_para_available()) return 0; - return cpuidle_register(&haltpoll_driver, NULL); + ret = cpuidle_register(&haltpoll_driver, NULL); + if (ret == 0) + arch_haltpoll_enable(); + + return ret; } static void __exit haltpoll_exit(void) { + arch_haltpoll_disable(); cpuidle_unregister(&haltpoll_driver); } -- cgit v1.2.3 From 32b91ca15353b2803d27cfc747156e72dd2cd5d8 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 18 Jul 2019 10:53:21 +0200 Subject: cpuidle: menu: Allow tick to be stopped if PM QoS is used After commit 554c8aa8ecad ("sched: idle: Select idle state before stopping the tick") the menu governor prevents the scheduler tick from being stopped (unless stopped already) if there is a PM QoS latency constraint for the given CPU and the target residency of the deepest idle state matching that constraint is below the tick boundary. However, that is problematic if CPUs with PM QoS latency constraints are idle for long times, because it effectively causes the tick to run on them all the time which is wasteful. [It is also confusing and questionable if they are full dynticks CPUs.] To address that issue, make the menu governor allow the tick to be stopped only if the idle duration predicted by it is beyond the tick boundary, except when the shallowest idle state is selected upfront and it is not a "polling" one. Fixes: 554c8aa8ecad ("sched: idle: Select idle state before stopping the tick") Link: https://lore.kernel.org/lkml/79b247b3-e056-610e-9a07-e685dfdaa6c9@gmail.com/ Reported-by: Thomas Lindroth Tested-by: Thomas Lindroth Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index dace4c7f830c..e5a5d0c8d66b 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -301,9 +301,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, !drv->states[0].disabled && !dev->states_usage[0].disable)) { /* * In this case state[0] will be used no matter what, so return - * it right away and keep the tick running. + * it right away and keep the tick running if state[0] is a + * polling one. 
*/ - *stop_tick = false; + *stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING); return 0; } @@ -394,16 +395,9 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, return idx; } - if (s->exit_latency > latency_req) { - /* - * If we break out of the loop for latency reasons, use - * the target residency of the selected state as the - * expected idle duration so that the tick is retained - * as long as that target residency is low enough. - */ - predicted_us = drv->states[idx].target_residency; + if (s->exit_latency > latency_req) break; - } + idx = i; } -- cgit v1.2.3 From cab09f3d2d2a0a6cb3dfb678660d67a2c3764f50 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 19 Jul 2019 12:12:42 +0200 Subject: cpuidle: teo: Allow tick to be stopped if PM QoS is used The TEO governor prevents the scheduler tick from being stopped (unless stopped already) if there is a PM QoS latency constraint for the given CPU and the target residency of the deepest idle state matching that constraint is below the tick boundary. However, that is problematic if CPUs with PM QoS latency constraints are idle for long times, because it effectively causes the tick to run on them all the time which is wasteful. [It is also confusing and questionable if they are full dynticks CPUs.] To address that issue, modify the TEO governor to carry out the entire search for the most suitable idle state (from the target residency perspective) even if a latency constraint is present, to allow it to determine the expected idle duration in all cases. Also, when using the last several measured idle duration values to refine the idle state selection, make it compare those values with the current expected idle duration value (instead of comparing them with the target residency of the idle state selected so far) which should prevent the tick from being retained when it makes sense to stop it sometimes (especially in the presence of PM QoS latency constraints). 
Fixes: b26bf6ab716f ("cpuidle: New timer events oriented governor for tickless systems") Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index a2fd81067a13..4d7a60c5b24a 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -242,7 +242,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); int latency_req = cpuidle_governor_latency_req(dev->cpu); unsigned int duration_us, count; - int max_early_idx, idx, i; + int max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; if (dev->last_state_idx >= 0) { @@ -257,6 +257,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, count = 0; max_early_idx = -1; + constraint_idx = drv->state_count; idx = -1; for (i = 0; i < drv->state_count; i++) { @@ -286,16 +287,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (s->target_residency > duration_us) break; - if (s->exit_latency > latency_req) { - /* - * If we break out of the loop for latency reasons, use - * the target residency of the selected state as the - * expected idle duration to avoid stopping the tick - * as long as that target residency is low enough. - */ - duration_us = drv->states[idx].target_residency; - goto refine; - } + if (s->exit_latency > latency_req && constraint_idx > i) + constraint_idx = i; idx = i; @@ -321,7 +314,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, duration_us = drv->states[idx].target_residency; } -refine: + /* + * If there is a latency constraint, it may be necessary to use a + * shallower idle state than the one selected so far. 
+ */ + if (constraint_idx < idx) + idx = constraint_idx; + if (idx < 0) { idx = 0; /* No states enabled. Must use 0. */ } else if (idx > 0) { @@ -331,13 +330,12 @@ refine: /* * Count and sum the most recent idle duration values less than - * the target residency of the state selected so far, find the - * max. + * the current expected idle duration value. */ for (i = 0; i < INTERVALS; i++) { unsigned int val = cpu_data->intervals[i]; - if (val >= drv->states[idx].target_residency) + if (val >= duration_us) continue; count++; @@ -356,8 +354,10 @@ refine: * would be too shallow. */ if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { - idx = teo_find_shallower_state(drv, dev, idx, avg_us); duration_us = avg_us; + if (drv->states[idx].target_residency > avg_us) + idx = teo_find_shallower_state(drv, dev, + idx, avg_us); } } } -- cgit v1.2.3 From b7e7fffd3e8c81aa0654ed4c9f7a142b4c4dab1a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Jul 2019 12:11:08 +0200 Subject: cpuidle: teo: Get rid of redundant check in teo_update() Notice that setting measured_us to UINT_MAX in teo_update() earlier doesn't change the behavior of the following code, so do that and eliminate a redundant check used for setting measured_us to UINT_MAX. This change is not expected to alter functionality. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 4d7a60c5b24a..b5a0e498f798 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -123,10 +123,11 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { /* - * One of the safety nets has triggered or this was a timer - * wakeup (or equivalent). 
+ * One of the safety nets has triggered or the wakeup was close + * enough to the closest timer event expected at the idle state + * selection time to be discarded. */ - measured_us = sleep_length_us; + measured_us = UINT_MAX; } else { unsigned int lat; @@ -188,15 +189,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->states[idx_timer].hits = hits; } - /* - * If the total time span between idle state selection and the "reflect" - * callback is greater than or equal to the sleep length determined at - * the idle state selection time, the wakeup is likely to be due to a - * timer event. - */ - if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) - measured_us = UINT_MAX; - /* * Save idle duration values corresponding to non-timer wakeups for * pattern detection. -- cgit v1.2.3 From 97d3eb9da84cae0548359b0aecb8619faad003b7 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 2 Sep 2019 11:40:31 +0100 Subject: cpuidle-haltpoll: vcpu hotplug support When cpus != maxcpus cpuidle-haltpoll will fail to register all vcpus past the online ones and thus fail to register the idle driver. This is because cpuidle_add_sysfs() will return with -ENODEV as a consequence of get_cpu_device() returning no device for a non-existing CPU. Instead switch to cpuidle_register_driver() and manually register each of the present cpus through cpuhp_setup_state() callbacks and future ones that get onlined or offlined. This mimics similar logic that intel_idle does. Fixes: fa86ee90eb11 ("add cpuidle-haltpoll driver") Signed-off-by: Joao Martins Signed-off-by: Boris Ostrovsky Reviewed-by: Marcelo Tosatti Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle-haltpoll.c | 68 +++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 9ac093dcbb01..56d8ab814466 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -11,12 +11,16 @@ */ #include +#include #include #include #include #include #include +static struct cpuidle_device __percpu *haltpoll_cpuidle_devices; +static enum cpuhp_state haltpoll_hp_state; + static int default_enter_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { @@ -46,6 +50,46 @@ static struct cpuidle_driver haltpoll_driver = { .state_count = 2, }; +static int haltpoll_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + dev = per_cpu_ptr(haltpoll_cpuidle_devices, cpu); + if (!dev->registered) { + dev->cpu = cpu; + if (cpuidle_register_device(dev)) { + pr_notice("cpuidle_register_device %d failed!\n", cpu); + return -EIO; + } + arch_haltpoll_enable(cpu); + } + + return 0; +} + +static int haltpoll_cpu_offline(unsigned int cpu) +{ + struct cpuidle_device *dev; + + dev = per_cpu_ptr(haltpoll_cpuidle_devices, cpu); + if (dev->registered) { + arch_haltpoll_disable(cpu); + cpuidle_unregister_device(dev); + } + + return 0; +} + +static void haltpoll_uninit(void) +{ + if (haltpoll_hp_state) + cpuhp_remove_state(haltpoll_hp_state); + cpuidle_unregister_driver(&haltpoll_driver); + + free_percpu(haltpoll_cpuidle_devices); + haltpoll_cpuidle_devices = NULL; +} + static int __init haltpoll_init(void) { int ret; @@ -56,17 +100,31 @@ static int __init haltpoll_init(void) if (!kvm_para_available()) return 0; - ret = cpuidle_register(&haltpoll_driver, NULL); - if (ret == 0) - arch_haltpoll_enable(); + ret = cpuidle_register_driver(drv); + if (ret < 0) + return ret; + + haltpoll_cpuidle_devices = alloc_percpu(struct cpuidle_device); + if (haltpoll_cpuidle_devices == 
NULL) { + cpuidle_unregister_driver(drv); + return -ENOMEM; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpuidle/haltpoll:online", + haltpoll_cpu_online, haltpoll_cpu_offline); + if (ret < 0) { + haltpoll_uninit(); + } else { + haltpoll_hp_state = ret; + ret = 0; + } return ret; } static void __exit haltpoll_exit(void) { - arch_haltpoll_disable(); - cpuidle_unregister(&haltpoll_driver); + haltpoll_uninit(); } module_init(haltpoll_init); -- cgit v1.2.3 From 82e430a6df7f0b5972c7fe717faffea823c6b84a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 2 Aug 2019 19:34:23 +0200 Subject: cpuidle: play_idle: Increase the resolution to usec The play_idle resolution is 1ms. The intel_powerclamp bases the idle duration on jiffies. The idle injection API is also using msec based duration but has no user yet. Unfortunately, msec based time does not fit well when we want to inject idle cycle precisely with shallow idle state. In order to set the scene for the incoming idle injection user, move the precision up to usec when calling play_idle. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/idle_inject.c | 2 +- drivers/thermal/intel/intel_powerclamp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index 24ff2a068978..10601f4bdf72 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -138,7 +138,7 @@ static void idle_inject_fn(unsigned int cpu) */ iit->should_run = 0; - play_idle(READ_ONCE(ii_dev->idle_duration_ms)); + play_idle(READ_ONCE(ii_dev->idle_duration_ms) * USEC_PER_MSEC); } /** diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 5149a817456b..53216dcbe173 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -430,7 +430,7 @@ static void clamp_idle_injection_func(struct kthread_work *work) if (should_skip) goto balance; - play_idle(jiffies_to_msecs(w_data->duration_jiffies)); + play_idle(jiffies_to_usecs(w_data->duration_jiffies)); balance: if (clamping && w_data->clamping && cpu_online(w_data->cpu)) -- cgit v1.2.3 From cd4c0763064f02f42824eed61be38eafdc702281 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 2 Aug 2019 19:34:24 +0200 Subject: powercap: idle_inject: Use higher resolution for idle injection The resolution of the idle injection is limited to 1ms. If there is a need for an injection of 1.2 ms, it is not possible. The idle injection API is not yet used, so it is safe to convert the existing API to the new time unit instead of adding more functions. Convert to microsecond in order to use a finer grain time unit when injecting idle cycles. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/idle_inject.c | 53 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index 10601f4bdf72..cd1270614cc6 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -59,14 +59,14 @@ struct idle_inject_thread { /** * struct idle_inject_device - idle injection data * @timer: idle injection period timer - * @idle_duration_ms: duration of CPU idle time to inject - * @run_duration_ms: duration of CPU run time to allow + * @idle_duration_us: duration of CPU idle time to inject + * @run_duration_us: duration of CPU run time to allow * @cpumask: mask of CPUs affected by idle injection */ struct idle_inject_device { struct hrtimer timer; - unsigned int idle_duration_ms; - unsigned int run_duration_ms; + unsigned int idle_duration_us; + unsigned int run_duration_us; unsigned long int cpumask[0]; }; @@ -104,16 +104,16 @@ static void idle_inject_wakeup(struct idle_inject_device *ii_dev) */ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) { - unsigned int duration_ms; + unsigned int duration_us; struct idle_inject_device *ii_dev = container_of(timer, struct idle_inject_device, timer); - duration_ms = READ_ONCE(ii_dev->run_duration_ms); - duration_ms += READ_ONCE(ii_dev->idle_duration_ms); + duration_us = READ_ONCE(ii_dev->run_duration_us); + duration_us += READ_ONCE(ii_dev->idle_duration_us); idle_inject_wakeup(ii_dev); - hrtimer_forward_now(timer, ms_to_ktime(duration_ms)); + hrtimer_forward_now(timer, ns_to_ktime(duration_us * NSEC_PER_USEC)); return HRTIMER_RESTART; } @@ -138,35 +138,35 @@ static void idle_inject_fn(unsigned int cpu) */ iit->should_run = 0; - play_idle(READ_ONCE(ii_dev->idle_duration_ms) * USEC_PER_MSEC); + play_idle(READ_ONCE(ii_dev->idle_duration_us)); } /** * idle_inject_set_duration - idle and run duration update helper - * @run_duration_ms: 
CPU run time to allow in milliseconds - * @idle_duration_ms: CPU idle time to inject in milliseconds + * @run_duration_us: CPU run time to allow in microseconds + * @idle_duration_us: CPU idle time to inject in microseconds */ void idle_inject_set_duration(struct idle_inject_device *ii_dev, - unsigned int run_duration_ms, - unsigned int idle_duration_ms) + unsigned int run_duration_us, + unsigned int idle_duration_us) { - if (run_duration_ms && idle_duration_ms) { - WRITE_ONCE(ii_dev->run_duration_ms, run_duration_ms); - WRITE_ONCE(ii_dev->idle_duration_ms, idle_duration_ms); + if (run_duration_us && idle_duration_us) { + WRITE_ONCE(ii_dev->run_duration_us, run_duration_us); + WRITE_ONCE(ii_dev->idle_duration_us, idle_duration_us); } } /** * idle_inject_get_duration - idle and run duration retrieval helper - * @run_duration_ms: memory location to store the current CPU run time - * @idle_duration_ms: memory location to store the current CPU idle time + * @run_duration_us: memory location to store the current CPU run time + * @idle_duration_us: memory location to store the current CPU idle time */ void idle_inject_get_duration(struct idle_inject_device *ii_dev, - unsigned int *run_duration_ms, - unsigned int *idle_duration_ms) + unsigned int *run_duration_us, + unsigned int *idle_duration_us) { - *run_duration_ms = READ_ONCE(ii_dev->run_duration_ms); - *idle_duration_ms = READ_ONCE(ii_dev->idle_duration_ms); + *run_duration_us = READ_ONCE(ii_dev->run_duration_us); + *idle_duration_us = READ_ONCE(ii_dev->idle_duration_us); } /** @@ -181,10 +181,10 @@ void idle_inject_get_duration(struct idle_inject_device *ii_dev, */ int idle_inject_start(struct idle_inject_device *ii_dev) { - unsigned int idle_duration_ms = READ_ONCE(ii_dev->idle_duration_ms); - unsigned int run_duration_ms = READ_ONCE(ii_dev->run_duration_ms); + unsigned int idle_duration_us = READ_ONCE(ii_dev->idle_duration_us); + unsigned int run_duration_us = READ_ONCE(ii_dev->run_duration_us); - if 
(!idle_duration_ms || !run_duration_ms) + if (!idle_duration_us || !run_duration_us) return -EINVAL; pr_debug("Starting injecting idle cycles on CPUs '%*pbl'\n", @@ -193,7 +193,8 @@ int idle_inject_start(struct idle_inject_device *ii_dev) idle_inject_wakeup(ii_dev); hrtimer_start(&ii_dev->timer, - ms_to_ktime(idle_duration_ms + run_duration_ms), + ns_to_ktime((idle_duration_us + run_duration_us) * + NSEC_PER_USEC), HRTIMER_MODE_REL); return 0; -- cgit v1.2.3 From cb5d8c45ab6c3daf8269e550cfb2d5018a876fe3 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Sun, 8 Sep 2019 00:45:21 +0100 Subject: cpuidle: allow governor switch on cpuidle_register_driver() The recently introduced haltpoll driver is largely only useful with haltpoll governor. To allow drivers to associate with a particular idle behaviour, add a @governor property to 'struct cpuidle_driver' and thus allow a cpuidle driver to switch to a *preferred* governor on idle driver registration. We save the previous governor, and when an idle driver is unregistered we switch back to that. The @governor can be overridden by cpuidle.governor= boot param or alternatively be ignored if the governor doesn't exist. Signed-off-by: Joao Martins Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.h | 2 ++ drivers/cpuidle/driver.c | 25 +++++++++++++++++++++++++ drivers/cpuidle/governor.c | 7 ++++--- 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h index d6613101af92..9f336af17fa6 100644 --- a/drivers/cpuidle/cpuidle.h +++ b/drivers/cpuidle/cpuidle.h @@ -9,6 +9,7 @@ /* For internal use only */ extern char param_governor[]; extern struct cpuidle_governor *cpuidle_curr_governor; +extern struct cpuidle_governor *cpuidle_prev_governor; extern struct list_head cpuidle_governors; extern struct list_head cpuidle_detected_devices; extern struct mutex cpuidle_lock; @@ -22,6 +23,7 @@ extern void cpuidle_install_idle_handler(void); extern void cpuidle_uninstall_idle_handler(void); /* governors */ +extern struct cpuidle_governor *cpuidle_find_governor(const char *str); extern int cpuidle_switch_governor(struct cpuidle_governor *gov); /* sysfs */ diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index dc32f34e68d9..80c1a830d991 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -254,12 +254,25 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv) */ int cpuidle_register_driver(struct cpuidle_driver *drv) { + struct cpuidle_governor *gov; int ret; spin_lock(&cpuidle_driver_lock); ret = __cpuidle_register_driver(drv); spin_unlock(&cpuidle_driver_lock); + if (!ret && !strlen(param_governor) && drv->governor && + (cpuidle_get_driver() == drv)) { + mutex_lock(&cpuidle_lock); + gov = cpuidle_find_governor(drv->governor); + if (gov) { + cpuidle_prev_governor = cpuidle_curr_governor; + if (cpuidle_switch_governor(gov) < 0) + cpuidle_prev_governor = NULL; + } + mutex_unlock(&cpuidle_lock); + } + return ret; } EXPORT_SYMBOL_GPL(cpuidle_register_driver); @@ -274,9 +287,21 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver); */ void cpuidle_unregister_driver(struct cpuidle_driver *drv) { + bool enabled = 
(cpuidle_get_driver() == drv); + spin_lock(&cpuidle_driver_lock); __cpuidle_unregister_driver(drv); spin_unlock(&cpuidle_driver_lock); + + if (!enabled) + return; + + mutex_lock(&cpuidle_lock); + if (cpuidle_prev_governor) { + if (!cpuidle_switch_governor(cpuidle_prev_governor)) + cpuidle_prev_governor = NULL; + } + mutex_unlock(&cpuidle_lock); } EXPORT_SYMBOL_GPL(cpuidle_unregister_driver); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index 2e3e14192bee..e9801f26c732 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -20,14 +20,15 @@ char param_governor[CPUIDLE_NAME_LEN]; LIST_HEAD(cpuidle_governors); struct cpuidle_governor *cpuidle_curr_governor; +struct cpuidle_governor *cpuidle_prev_governor; /** - * __cpuidle_find_governor - finds a governor of the specified name + * cpuidle_find_governor - finds a governor of the specified name * @str: the name * * Must be called with cpuidle_lock acquired. */ -static struct cpuidle_governor * __cpuidle_find_governor(const char *str) +struct cpuidle_governor *cpuidle_find_governor(const char *str) { struct cpuidle_governor *gov; @@ -87,7 +88,7 @@ int cpuidle_register_governor(struct cpuidle_governor *gov) return -ENODEV; mutex_lock(&cpuidle_lock); - if (__cpuidle_find_governor(gov->name) == NULL) { + if (cpuidle_find_governor(gov->name) == NULL) { ret = 0; list_add_tail(&gov->governor_list, &cpuidle_governors); if (!cpuidle_curr_governor || -- cgit v1.2.3 From 7321440829a27d58c88b7fcfcbbc37487b5e39a5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Sun, 8 Sep 2019 00:45:22 +0100 Subject: cpuidle-haltpoll: set haltpoll as preferred governor Right now, guest current governors have the following ratings: * ladder -> 10 * teo -> 19 * menu -> 20 * haltpoll -> 21 * ladder + nohz=off -> 25 haltpoll governor got introduced and it is now the default governor given its highest rating -- with ladder+nohz being the exception -- regardless of idle driver in the guest. 
An example of an undesirable case is x86 KVM guests with MWAIT which have intel_idle registered first, and consequently will have haltpoll be used as governor which would get limited to a poll state and state 1 and the other states wouldn't get used. To keep the previous defaults we decrease rating of governor to 9 (below current lowest rating) and thus rely on @governor switch on cpuidle_register_driver() to tie in haltpoll idle driver and governor together. Signed-off-by: Joao Martins Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-haltpoll.c | 1 + drivers/cpuidle/governors/haltpoll.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 56d8ab814466..519e90d125cf 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -34,6 +34,7 @@ static int default_enter_idle(struct cpuidle_device *dev, static struct cpuidle_driver haltpoll_driver = { .name = "haltpoll", + .governor = "haltpoll", .owner = THIS_MODULE, .states = { { /* entry 0 is for polling */ }, diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index 797477bda486..7a703d2e0064 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -133,7 +133,7 @@ static int haltpoll_enable_device(struct cpuidle_driver *drv, static struct cpuidle_governor haltpoll_governor = { .name = "haltpoll", - .rating = 21, + .rating = 9, .enable = haltpoll_enable_device, .select = haltpoll_select, .reflect = haltpoll_reflect, -- cgit v1.2.3 From 5cc59f597c0666c6a7e1c67aac9063895949fd56 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Sun, 8 Sep 2019 00:45:23 +0100 Subject: cpuidle-haltpoll: return -ENODEV on modinit failure When a user loads cpuidle-haltpoll on a non KVM guest the module will successfully load, even though idle driver registration didn't take place. 
We should instead return -ENODEV signaling the user that the driver can't be loaded, like other error paths in haltpoll_init(). An example of such error paths is when we return -EBUSY when attempting to register an idle driver when it had one already (e.g. intel_idle loads at boot and then we attempt to insert module cpuidle-haltpoll). Fixes: fa86ee90eb11 ("add cpuidle-haltpoll driver") Signed-off-by: Joao Martins Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-haltpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 519e90d125cf..7a0239ef717e 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -99,7 +99,7 @@ static int __init haltpoll_init(void) cpuidle_poll_state_init(drv); if (!kvm_para_available()) - return 0; + return -ENODEV; ret = cpuidle_register_driver(drv); if (ret < 0) -- cgit v1.2.3 From 472f263660832b90e53bede2020f68cd14f8b76c Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Sun, 8 Sep 2019 00:45:24 +0100 Subject: cpuidle-haltpoll: do not set an owner to allow modunload cpuidle-haltpoll can be built as a module to allow optional late load. Given we are setting @owner to THIS_MODULE, cpuidle will attempt to grab a module reference every time a cpuidle_device is registered -- so essentially all online cpus get a reference. This prevents the module from being unloaded later, which makes the module_exit callback entirely unused. Thus remove the @owner and allow the module to be unloaded. Fixes: fa86ee90eb11 ("add cpuidle-haltpoll driver") Signed-off-by: Joao Martins Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle-haltpoll.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 7a0239ef717e..49a65c6fe91e 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -35,7 +35,6 @@ static int default_enter_idle(struct cpuidle_device *dev, static struct cpuidle_driver haltpoll_driver = { .name = "haltpoll", .governor = "haltpoll", - .owner = THIS_MODULE, .states = { { /* entry 0 is for polling */ }, { -- cgit v1.2.3 From 1328edca4a142ee3c7442d1eece2c3ca383eff35 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 29 Aug 2019 16:49:57 +0800 Subject: cpuidle-haltpoll: Enable kvm guest polling when dedicated physical CPUs are available The downside of guest side polling is that polling is performed even with other runnable tasks in the host. However, even if polling in KVM can be aware of whether or not there are other runnable tasks on the same pCPU, it can still incur extra overhead in an over-subscribed scenario. Now we can just enable guest polling when dedicated pCPUs are available. Acked-by: Paolo Bonzini Signed-off-by: Wanpeng Li Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-haltpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index 49a65c6fe91e..932390b028f1 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -97,7 +97,8 @@ static int __init haltpoll_init(void) cpuidle_poll_state_init(drv); - if (!kvm_para_available()) + if (!kvm_para_available() || + !kvm_para_has_hint(KVM_HINTS_REALTIME)) return -ENODEV; ret = cpuidle_register_driver(drv); -- cgit v1.2.3