From 5a25e3f7cc538fb49e11267c1e41c54ccf83835e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Mar 2019 12:15:13 +0100 Subject: cpufreq: intel_pstate: Driver-specific handling of _PPC updates In some cases, the platform firmware disables or enables turbo frequencies for all CPUs globally before triggering a _PPC change notification for one of them. Obviously, that global change affects all CPUs, not just the notified one, and it needs to be acted upon by cpufreq. The intel_pstate driver is able to detect such global changes of the settings, but it also needs to update policy limits for all CPUs if that happens, in particular if turbo frequencies are enabled globally - to allow them to be used. For this reason, introduce a new cpufreq driver callback to be invoked on _PPC notifications, if present, instead of simply calling cpufreq_update_policy() for the notified CPU and make intel_pstate use it to trigger policy updates for all CPUs in the system if global settings change. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759 Reported-by: Gabriele Mazzotta Tested-by: Gabriele Mazzotta Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/acpi/processor_perflib.c | 2 +- drivers/cpufreq/cpufreq.c | 16 ++++++++++++++++ drivers/cpufreq/intel_pstate.c | 24 ++++++++++++++++++++++++ include/linux/cpufreq.h | 4 ++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index a303fd0e108c..c73d3a62799a 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -181,7 +181,7 @@ void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) acpi_processor_ppc_ost(pr->handle, 0); } if (ret >= 0) - cpufreq_update_policy(pr->id); + cpufreq_update_limits(pr->id); } int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e10922709d13..bb63347f6af1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2370,6 +2370,22 @@ unlock: } EXPORT_SYMBOL(cpufreq_update_policy); +/** + * cpufreq_update_limits - Update policy limits for a given CPU. + * @cpu: CPU to update the policy limits for. + * + * Invoke the driver's ->update_limits callback if present or call + * cpufreq_update_policy() for @cpu. + */ +void cpufreq_update_limits(unsigned int cpu) +{ + if (cpufreq_driver->update_limits) + cpufreq_driver->update_limits(cpu); + else + cpufreq_update_policy(cpu); +} +EXPORT_SYMBOL_GPL(cpufreq_update_limits); + /********************************************************************* * BOOST * *********************************************************************/ diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b599c7318aab..e2191a570ade 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -179,6 +179,7 @@ struct vid_data { * based on the MSR_IA32_MISC_ENABLE value and whether or * not the maximum reported turbo P-state is different from * the maximum reported non-turbo one. + * @turbo_disabled_s: Saved @turbo_disabled value. * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo * P-state capacity. 
* @max_perf_pct: Maximum capacity limit in percent of the maximum turbo @@ -187,6 +188,7 @@ struct vid_data { struct global_params { bool no_turbo; bool turbo_disabled; + bool turbo_disabled_s; int max_perf_pct; int min_perf_pct; }; @@ -897,6 +899,25 @@ static void intel_pstate_update_policies(void) cpufreq_update_policy(cpu); } +static void intel_pstate_update_limits(unsigned int cpu) +{ + mutex_lock(&intel_pstate_driver_lock); + + update_turbo_state(); + /* + * If turbo has been turned on or off globally, policy limits for + * all CPUs need to be updated to reflect that. + */ + if (global.turbo_disabled_s != global.turbo_disabled) { + global.turbo_disabled_s = global.turbo_disabled; + intel_pstate_update_policies(); + } else { + cpufreq_update_policy(cpu); + } + + mutex_unlock(&intel_pstate_driver_lock); +} + /************************** sysfs begin ************************/ #define show_one(file_name, object) \ static ssize_t show_##file_name \ @@ -2138,6 +2159,7 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; update_turbo_state(); + global.turbo_disabled_s = global.turbo_disabled; policy->cpuinfo.max_freq = global.turbo_disabled ? cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; policy->cpuinfo.max_freq *= cpu->pstate.scaling; @@ -2182,6 +2204,7 @@ static struct cpufreq_driver intel_pstate = { .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, .stop_cpu = intel_pstate_stop_cpu, + .update_limits = intel_pstate_update_limits, .name = "intel_pstate", }; @@ -2316,6 +2339,7 @@ static struct cpufreq_driver intel_cpufreq = { .init = intel_cpufreq_cpu_init, .exit = intel_pstate_cpu_exit, .stop_cpu = intel_cpufreq_stop_cpu, + .update_limits = intel_pstate_update_limits, .name = "intel_cpufreq", }; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b160e98076e3..5005ea40364f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -195,6 +195,7 @@ void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); void cpufreq_update_policy(unsigned int cpu); +void cpufreq_update_limits(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); void cpufreq_enable_fast_switch(struct cpufreq_policy *policy); @@ -322,6 +323,9 @@ struct cpufreq_driver { /* should be defined, if possible */ unsigned int (*get)(unsigned int cpu); + /* Called to update policy limits on firmware notifications. */ + void (*update_limits)(unsigned int cpu); + /* optional */ int (*bios_limit)(int cpu, unsigned int *limit); -- cgit v1.2.3 From 540a375822a40675868c5f62ae57b608a579e56a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Mar 2019 12:16:58 +0100 Subject: cpufreq: Add cpufreq_cpu_acquire() and cpufreq_cpu_release() It sometimes is necessary to find a cpufreq policy for a given CPU and acquire its rwsem (for writing) immediately after that, so introduce cpufreq_cpu_acquire() as a helper for that and the complementary cpufreq_cpu_release(). Make cpufreq_update_policy() use the new functions. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 56 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bb63347f6af1..d8fc395af773 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -250,6 +250,51 @@ void cpufreq_cpu_put(struct cpufreq_policy *policy) } EXPORT_SYMBOL_GPL(cpufreq_cpu_put); +/** + * cpufreq_cpu_release - Unlock a policy and decrement its usage counter. + * @policy: cpufreq policy returned by cpufreq_cpu_acquire(). + */ +static void cpufreq_cpu_release(struct cpufreq_policy *policy) +{ + if (WARN_ON(!policy)) + return; + + lockdep_assert_held(&policy->rwsem); + + up_write(&policy->rwsem); + + cpufreq_cpu_put(policy); +} + +/** + * cpufreq_cpu_acquire - Find policy for a CPU, mark it as busy and lock it. + * @cpu: CPU to find the policy for. + * + * Call cpufreq_cpu_get() to get a reference on the cpufreq policy for @cpu and + * if the policy returned by it is not NULL, acquire its rwsem for writing. + * Return the policy if it is active or release it and return NULL otherwise. + * + * The policy returned by this function has to be released with the help of + * cpufreq_cpu_release() in order to release its rwsem and balance its usage + * counter properly. + */ +static struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + + if (!policy) + return NULL; + + down_write(&policy->rwsem); + + if (policy_is_inactive(policy)) { + cpufreq_cpu_release(policy); + return NULL; + } + + return policy; +} + /********************************************************************* * EXTERNALLY AFFECTING FREQUENCY CHANGES * *********************************************************************/ @@ -2337,17 +2382,12 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, */ void cpufreq_update_policy(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); struct cpufreq_policy new_policy; if (!policy) return; - down_write(&policy->rwsem); - - if (policy_is_inactive(policy)) - goto unlock; - /* * BIOS might change freq behind our back * -> ask driver for current freq and notify governors about a change @@ -2364,9 +2404,7 @@ void cpufreq_update_policy(unsigned int cpu) cpufreq_set_policy(policy, &new_policy); unlock: - up_write(&policy->rwsem); - - cpufreq_cpu_put(policy); + cpufreq_cpu_release(policy); } EXPORT_SYMBOL(cpufreq_update_policy); -- cgit v1.2.3 From c324f43aed89b33eee37aaf022b32c31a2b3104d Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 28 Mar 2019 14:33:58 +0100 Subject: cpuidle: exynos: Unify target residency for AFTR and coupled AFTR states Since commit 45f1ff59e27c ("cpuidle: Return nohz hint from cpuidle_select()") Exynos CPUidle driver stopped entering C1 (AFTR) mode on Exynos4412-based Trats2 board. Further analysis revealed that the CPUidle framework changed the way it handles predicted timer ticks and reported target residency for the given idle states. As a result, the C1 (AFTR) state was not chosen anymore on completely idle device. The main issue was to high target residency value. The similar C1 (AFTR) state for 'coupled' CPUidle version used 10 times lower value for the target residency, despite the fact that it is the same state from the hardware perspective. 
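For illustration, a minimal sketch of why ->target_residency drives state selection; this is simplified logic rather than the actual menu governor code, and the helper name is made up for the example:

#include <linux/cpuidle.h>

/*
 * A governor only treats a state as a candidate when the predicted idle
 * duration is at least the state's target_residency (in microseconds).
 * With C1 (AFTR) declared at 100000 us, even a fully idle board rarely
 * predicts a sleep that long, so the state is never entered.
 */
static int pick_deepest_usable_state(struct cpuidle_driver *drv,
				     unsigned int predicted_us)
{
	int i, idx = 0;

	for (i = 1; i < drv->state_count; i++) {
		if (drv->states[i].target_residency <= predicted_us)
			idx = i;	/* deepest state still worth entering */
	}
	return idx;
}

Lowering the value to 10000 us, as the hunk below does, makes the state reachable again under realistic idle-duration predictions.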
The 100000us value for standard C1 (AFTR) mode is there from the begining of the support for this idle state, added by the commit 67173ca492ab ("ARM: EXYNOS: Add support AFTR mode on EXYNOS4210"). That commit doesn't give any reason for it, instead it looks like it was blindly copied from the WFI/IDLE state of the same driver that time. That time, that value was probably not really used by the framework for any critical decision, so it didn't matter that much. Now it turned out to be an issue, so unify the target residency with the 'coupled' version, as it seems to better match the real use case values and restores the operation of the Exynos CPUidle driver on the idle device. Signed-off-by: Marek Szyprowski Reviewed-by: Krzysztof Kozlowski Acked-by: Daniel Lezcano Acked-by: Bartlomiej Zolnierkiewicz Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-exynos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle-exynos.c b/drivers/cpuidle/cpuidle-exynos.c index 0171a6e190d7..f7199a35cbb6 100644 --- a/drivers/cpuidle/cpuidle-exynos.c +++ b/drivers/cpuidle/cpuidle-exynos.c @@ -84,7 +84,7 @@ static struct cpuidle_driver exynos_idle_driver = { [1] = { .enter = exynos_enter_lowpower, .exit_latency = 300, - .target_residency = 100000, + .target_residency = 10000, .name = "C1", .desc = "ARM power down", }, -- cgit v1.2.3 From 74a1dd86d1739eae2015a3832f62c1d6546893a7 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 25 Mar 2019 10:24:56 -0700 Subject: PM / wakeup: Use pm_pr_dbg() instead of pr_debug() These prints are useful if we're doing PM suspend debugging. Having them at pr_debug() level means that we need to either enable DEBUG in this file, or compile the kernel with dynamic debug capabilities. Both of these options have drawbacks like custom compilation or opting into all debug statements being included into the kernel image. Given that we already have infrastructure to collect PM debugging information with CONFIG_PM_DEBUG and friends, let's change the pr_debug usage here to be pm_pr_dbg() instead so we can collect the wakeup information in the kernel logs. Signed-off-by: Stephen Boyd Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/wakeup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index bb1ae175fae1..23c243a4c675 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -804,7 +804,7 @@ void pm_print_active_wakeup_sources(void) srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { if (ws->active) { - pr_debug("active wakeup source: %s\n", ws->name); + pm_pr_dbg("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || @@ -815,7 +815,7 @@ void pm_print_active_wakeup_sources(void) } if (!active && last_activity_ws) - pr_debug("last active wakeup source: %s\n", + pm_pr_dbg("last active wakeup source: %s\n", last_activity_ws->name); srcu_read_unlock(&wakeup_srcu, srcuidx); } @@ -845,7 +845,7 @@ bool pm_wakeup_pending(void) raw_spin_unlock_irqrestore(&events_lock, flags); if (ret) { - pr_debug("Wakeup pending, aborting suspend\n"); + pm_pr_dbg("Wakeup pending, aborting suspend\n"); pm_print_active_wakeup_sources(); } -- cgit v1.2.3 From b5dee3130bb4014511f5d0dd46855ed843e3fdc8 Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:41 +0800 Subject: PM / sleep: Refactor filesystems sync to reduce duplication Create a common helper to sync filesystems for system suspend and hibernation. Signed-off-by: Harry Pan Acked-by: Pavel Machek [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 3 +++ kernel/power/hibernate.c | 5 +---- kernel/power/main.c | 9 +++++++++ kernel/power/suspend.c | 13 +++++-------- kernel/power/user.c | 5 +---- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3f529ad9a9d2..6b3ea9ea6a9e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -425,6 +425,7 @@ void restore_processor_state(void); /* kernel/power/main.c */ extern int register_pm_notifier(struct notifier_block *nb); extern int unregister_pm_notifier(struct notifier_block *nb); +extern void ksys_sync_helper(void); #define pm_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ @@ -462,6 +463,8 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) return 0; } +static inline void ksys_sync_helper(void) {} + #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_wakeup_pending(void) { return false; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..cc105ecd9c07 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -709,9 +708,7 @@ int hibernate(void) goto Exit; } - pr_info("Syncing filesystems ... \n"); - ksys_sync(); - pr_info("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..40472a7c5536 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "power.h" @@ -51,6 +52,14 @@ void unlock_system_sleep(void) } EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ + pr_info("Syncing filesystems ... 
"); + ksys_sync(); + pr_cont("done.\n"); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..e39059dea38b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC - trace_suspend_resume(TPS("sync_filesystems"), 0, true); - pr_info("Syncing filesystems ... "); - ksys_sync(); - pr_cont("done.\n"); - trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif + if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + ksys_sync_helper(); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include @@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - printk("Syncing filesystems ... "); - ksys_sync(); - printk("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) -- cgit v1.2.3 From c64546b17bc940643545dd34eac21f51764d633c Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:43 +0800 Subject: PM / sleep: Measure the time of filesystems syncing Measure the filesystems sync time during system sleep more precisely. Among other things, this allows the pr_cont() to be dropped from ksys_sync_helper() and makes automatic system suspend and hibernation profiling somewhat more straightforward. Signed-off-by: Harry Pan [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/power/main.c b/kernel/power/main.c index 40472a7c5536..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -54,9 +54,14 @@ EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { - pr_info("Syncing filesystems ... "); + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); ksys_sync(); - pr_cont("done.\n"); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); -- cgit v1.2.3 From 5861381d486601430cccf64849bd0a226154bc0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:18:01 +0100 Subject: PM / arch: x86: Rework the MSR_IA32_ENERGY_PERF_BIAS handling The current handling of MSR_IA32_ENERGY_PERF_BIAS in the kernel is problematic, because it may cause changes made by user space to that MSR (with the help of the x86_energy_perf_policy tool, for example) to be lost every time a CPU goes offline and then back online as well as during system-wide power management transitions into sleep states and back into the working state. The first problem is that if the current EPB value for a CPU going online is 0 ('performance'), the kernel will change it to 6 ('normal') regardless of whether or not this is the first bring-up of that CPU. 
That also happens during system-wide resume from sleep states (including, but not limited to, hibernation). However, the EPB may have been adjusted by user space this way and the kernel should not blindly override that setting. The second problem is that if the platform firmware resets the EPB values for any CPUs during system-wide resume from a sleep state, the kernel will not restore their previous EPB values that may have been set by user space before the preceding system-wide suspend transition. Again, that behavior may at least be confusing from the user space perspective. In order to address these issues, rework the handling of MSR_IA32_ENERGY_PERF_BIAS so that the EPB value is saved on CPU offline and restored on CPU online as well as (for the boot CPU) during the syscore stages of system-wide suspend and resume transitions, respectively. However, retain the policy by which the EPB is set to 6 ('normal') on the first bring-up of each CPU if its initial value is 0, based on the observation that 0 may mean 'not initialized' just as well as 'performance' in that case. While at it, move the MSR_IA32_ENERGY_PERF_BIAS handling code into a separate file and document it in Documentation/admin-guide. Fixes: abe48b108247 (x86, intel, power: Initialize MSR_IA32_ENERGY_PERF_BIAS) Fixes: b51ef52df71c (x86/cpu: Restore MSR_IA32_ENERGY_PERF_BIAS after resume) Reported-by: Thomas Renninger Signed-off-by: Rafael J. Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov Acked-by: Thomas Gleixner --- Documentation/admin-guide/pm/intel_epb.rst | 6 ++ Documentation/admin-guide/pm/working-state.rst | 1 + arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 17 ---- arch/x86/kernel/cpu/cpu.h | 1 - arch/x86/kernel/cpu/intel.c | 34 ------- arch/x86/kernel/cpu/intel_epb.c | 131 +++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 8 files changed, 140 insertions(+), 53 deletions(-) create mode 100644 Documentation/admin-guide/pm/intel_epb.rst create mode 100644 arch/x86/kernel/cpu/intel_epb.c diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst new file mode 100644 index 000000000000..e9cfa7ec5420 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -0,0 +1,6 @@ +====================================== +Intel Performance and Energy Bias Hint +====================================== + +.. 
kernel-doc:: arch/x86/kernel/cpu/intel_epb.c + :doc: overview diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index b6cef9b5e961..beb004d3632b 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -8,3 +8,4 @@ Working-State Power Management cpuidle cpufreq intel_pstate + intel_epb diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfd24f9f7614..1796d2bdcaaa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..5e37dfa4d9df 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1864,23 +1864,6 @@ void cpu_init(void) } #endif -static void bsp_resume(void) -{ - if (this_cpu->c_bsp_resume) - this_cpu->c_bsp_resume(&boot_cpu_data); -} - -static struct syscore_ops cpu_syscore_ops = { - .resume = bsp_resume, -}; - -static int __init init_cpu_syscore(void) -{ - register_syscore_ops(&cpu_syscore_ops); - return 0; -} -core_initcall(init_cpu_syscore); - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 5eb946b9a9f3..c0e2407abdd6 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -14,7 +14,6 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..f17c1a714779 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -596,36 +596,6 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } -static void init_intel_energy_perf(struct cpuinfo_x86 *c) -{ - u64 epb; - - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. - * (x86_energy_perf_policy(8) is available to change it at run-time.) 
- */ - if (!cpu_has(c, X86_FEATURE_EPB)) - return; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) - return; - - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -} - -static void intel_bsp_resume(struct cpuinfo_x86 *c) -{ - /* - * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, - * so reinitialize it properly like during bootup: - */ - init_intel_energy_perf(c); -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); - init_intel_energy_perf(c); - init_intel_misc_features(c); } @@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, - .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000000..8d53cc88bd22 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki + */ + +#include +#include +#include + +#include +#include + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space through + * the generic MSR interface (with the help of the x86_energy_perf_policy tool), + * but there are two reasons for the kernel to touch it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. 
That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static int intel_epb_online(unsigned int cpu) +{ + intel_epb_restore(); + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + return intel_epb_save(); +} + +static __init int intel_epb_init(void) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e78281d07b70..dbfdd0fadbef 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -147,6 +147,7 @@ enum cpuhp_state { CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, + CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, -- cgit v1.2.3 From b9c273babce791cf228fc466577f55056a699f9c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:20:17 +0100 Subject: PM / arch: x86: MSR_IA32_ENERGY_PERF_BIAS sysfs interface The Performance and Energy Bias Hint (EPB) is expected to be set by user space through the generic MSR interface, but that interface is not particularly nice and there are security concerns regarding it, so it is not always available. For this reason, add a sysfs interface for reading and updating the EPB, in the form of a new attribute, energy_perf_bias, located under /sys/devices/system/cpu/cpu#/power/ for online CPUs that support the EPB feature. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov --- Documentation/ABI/testing/sysfs-devices-system-cpu | 18 +++++ Documentation/admin-guide/pm/intel_epb.rst | 27 +++++++ arch/x86/kernel/cpu/intel_epb.c | 93 +++++++++++++++++++++- 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 9605dbd4b5b5..7f4af7da3fbc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -518,3 +518,21 @@ Description: Control Symetric Multi Threading (SMT) If control status is "forceoff" or "notsupported" writes are rejected. + +What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias +Date: March 2019 +Contact: linux-pm@vger.kernel.org +Description: Intel Energy and Performance Bias Hint (EPB) + + EPB for the given CPU in a sliding scale 0 - 15, where a value + of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to change the EPB value for the CPU, write either + a number in the 0 - 15 sliding scale above, or one of the + strings: "performance", "balance-performance", "normal", + "balance-power", "power" (that represent values reflected by + their meaning), to this attribute. + + This attribute is present for all online CPUs supporting the + Intel EPB feature. diff --git a/Documentation/admin-guide/pm/intel_epb.rst b/Documentation/admin-guide/pm/intel_epb.rst index e9cfa7ec5420..d100849edfc4 100644 --- a/Documentation/admin-guide/pm/intel_epb.rst +++ b/Documentation/admin-guide/pm/intel_epb.rst @@ -4,3 +4,30 @@ Intel Performance and Energy Bias Hint .. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c :doc: overview + +Intel Performance and Energy Bias Attribute in ``sysfs`` +======================================================== + +The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU +can be checked or updated through a ``sysfs`` attribute (file) under +:file:`/sys/devices/system/cpu/cpu/power/`, where the CPU number ```` +is allocated at the system initialization time: + +``energy_perf_bias`` + Shows the current EPB value for the CPU in a sliding scale 0 - 15, where + a value of 0 corresponds to a hint preference for highest performance + and a value of 15 corresponds to the maximum energy savings. + + In order to update the EPB value for the CPU, this attribute can be + written to, either with a number in the 0 - 15 sliding scale above, or + with one of the strings: "performance", "balance-performance", "normal", + "balance-power", "power" that represent values reflected by their + meaning. + + This attribute is present for all online CPUs supporting the EPB + feature. + +Note that while the EPB interface to the processor is defined at the logical CPU +level, the physical register backing it may be shared by multiple CPUs (for +example, SMT siblings or cores in one package). For this reason, updating the +EPB value for one CPU may cause the EPB values for other CPUs to change. 
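As a rough user-space sketch of driving the attribute documented above (the CPU number and the chosen preference are arbitrary; the path and the accepted strings follow the ABI text):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/cpu0/power/energy_perf_bias";
	char cur[32];
	FILE *f;

	/* Read back the current hint (a number in the 0 - 15 scale). */
	f = fopen(path, "r");
	if (!f || !fgets(cur, sizeof(cur), f)) {
		perror("read energy_perf_bias");
		return EXIT_FAILURE;
	}
	fclose(f);
	printf("current EPB: %s", cur);

	/* Update it; either a number or one of the named presets works. */
	f = fopen(path, "w");
	if (!f || fputs("balance-power\n", f) == EOF) {
		perror("write energy_perf_bias");
		return EXIT_FAILURE;
	}
	fclose(f);
	return EXIT_SUCCESS;
}

Writing requires root, and on SMT siblings or cores sharing the backing register the new value may be reflected by other CPUs as well, as the documentation text above notes.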
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 8d53cc88bd22..f4dd73396f28 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -9,8 +9,12 @@ */ #include +#include +#include #include +#include #include +#include #include #include @@ -20,9 +24,9 @@ * * The Performance and Energy Bias Hint (EPB) allows software to specify its * preference with respect to the power-performance tradeoffs present in the - * processor. Generally, the EPB is expected to be set by user space through - * the generic MSR interface (with the help of the x86_energy_perf_policy tool), - * but there are two reasons for the kernel to touch it. + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. * * First, there are systems where the platform firmware resets the EPB during * system-wide transitions from sleep states back into the working state @@ -52,6 +56,7 @@ static DEFINE_PER_CPU(u8, saved_epb); #define EPB_MASK 0x0fULL #define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK static int intel_epb_save(void) { @@ -97,15 +102,95 @@ static struct syscore_ops intel_epb_syscore_ops = { .resume = intel_epb_restore, }; +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + static int intel_epb_online(unsigned int cpu) { + struct device *cpu_dev = get_cpu_device(cpu); + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + return 0; } static int intel_epb_offline(unsigned int cpu) { - return intel_epb_save(); + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; } static __init int intel_epb_init(void) -- cgit v1.2.3 From 9eca544b1491df90ea7102a7ed14acc3c562d97b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 28 Mar 2019 11:33:21 +0100 Subject: cpufreq: schedutil: Simplify iowait boosting There is not reason for the minimum iowait boost value in the schedutil cpufreq governor to depend on the available range of CPU frequencies. In fact, that dependency is generally confusing, because it causes the iowait boost to behave somewhat differently on CPUs with the same maximum frequency and different minimum frequencies, for example. For this reason, replace the min field in struct sugov_cpu with a constant and choose its values to be 1/8 of SCHED_CAPACITY_SCALE (for consistency with the intel_pstate driver's internal governor). [Note that policy->cpuinfo.max_freq will not be a constant any more after a subsequent change, so this change is depended on by it.] Link: https://lore.kernel.org/lkml/20190305083202.GU32494@hirez.programming.kicks-ass.net/T/#ee20bdc98b7d89f6110c0d00e5c3ee8c2ced93c3d Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c41ea367422..b3a878aa593d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@ #include #include +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -51,7 +53,6 @@ struct sugov_cpu { u64 last_update; unsigned long bw_dl; - unsigned long min; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO. */ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,7 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -317,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + * * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. 
*/ @@ -349,7 +351,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** @@ -389,7 +391,7 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; return util; } @@ -826,9 +828,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->min = - (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) / - policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { -- cgit v1.2.3 From 9083e4986124389e2a7c0ffca95630a4983887f0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Mar 2019 12:19:52 +0100 Subject: cpufreq: intel_pstate: Update max frequency on global turbo changes While the cpuinfo.max_freq value doesn't really matter for intel_pstate in the active mode, in the passive mode it is used by governors as the maximum physical frequency of the CPU and the results of governor computations generally depend on it. Also it is made available to user space via sysfs and it should match the current HW configuration. For this reason, make intel_pstate update cpuinfo.max_freq for all CPUs if it detects a global change of turbo frequency settings from "disable" to "enable" or the other way associated with a _PPC change notification from the platform firmware. Note that policy_is_inactive(), cpufreq_cpu_acquire(), cpufreq_cpu_release(), and cpufreq_set_policy() need to be made available to it for this purpose. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759 Reported-by: Gabriele Mazzotta Tested-by: Gabriele Mazzotta Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 16 ++++------------ drivers/cpufreq/intel_pstate.c | 35 +++++++++++++++++++++++++++++------ include/linux/cpufreq.h | 10 ++++++++++ 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d8fc395af773..f3f79266ab48 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -34,11 +34,6 @@ static LIST_HEAD(cpufreq_policy_list); -static inline bool policy_is_inactive(struct cpufreq_policy *policy) -{ - return cpumask_empty(policy->cpus); -} - /* Macros to iterate over CPU policies */ #define for_each_suitable_policy(__policy, __active) \ list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \ @@ -254,7 +249,7 @@ EXPORT_SYMBOL_GPL(cpufreq_cpu_put); * cpufreq_cpu_release - Unlock a policy and decrement its usage counter. * @policy: cpufreq policy returned by cpufreq_cpu_acquire(). */ -static void cpufreq_cpu_release(struct cpufreq_policy *policy) +void cpufreq_cpu_release(struct cpufreq_policy *policy) { if (WARN_ON(!policy)) return; @@ -278,7 +273,7 @@ static void cpufreq_cpu_release(struct cpufreq_policy *policy) * cpufreq_cpu_release() in order to release its rwsem and balance its usage * counter properly. 
*/ -static struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu) +struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); @@ -714,9 +709,6 @@ static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) return ret; } -static int cpufreq_set_policy(struct cpufreq_policy *policy, - struct cpufreq_policy *new_policy); - /** * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access */ @@ -2274,8 +2266,8 @@ EXPORT_SYMBOL(cpufreq_get_policy); * * The cpuinfo part of @policy is not updated by this function. */ -static int cpufreq_set_policy(struct cpufreq_policy *policy, - struct cpufreq_policy *new_policy) +int cpufreq_set_policy(struct cpufreq_policy *policy, + struct cpufreq_policy *new_policy) { struct cpufreq_governor *old_gov; int ret; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e2191a570ade..08d1a1e845aa 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -179,7 +179,7 @@ struct vid_data { * based on the MSR_IA32_MISC_ENABLE value and whether or * not the maximum reported turbo P-state is different from * the maximum reported non-turbo one. - * @turbo_disabled_s: Saved @turbo_disabled value. + * @turbo_disabled_mf: The @turbo_disabled value reflected by cpuinfo.max_freq. * @min_perf_pct: Minimum capacity limit in percent of the maximum turbo * P-state capacity. * @max_perf_pct: Maximum capacity limit in percent of the maximum turbo @@ -188,7 +188,7 @@ struct vid_data { struct global_params { bool no_turbo; bool turbo_disabled; - bool turbo_disabled_s; + bool turbo_disabled_mf; int max_perf_pct; int min_perf_pct; }; @@ -899,6 +899,28 @@ static void intel_pstate_update_policies(void) cpufreq_update_policy(cpu); } +static void intel_pstate_update_max_freq(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu); + struct cpufreq_policy new_policy; + struct cpudata *cpudata; + + if (!policy) + return; + + cpudata = all_cpu_data[cpu]; + policy->cpuinfo.max_freq = global.turbo_disabled_mf ? + cpudata->pstate.max_freq : cpudata->pstate.turbo_freq; + + memcpy(&new_policy, policy, sizeof(*policy)); + new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq); + new_policy.min = min(policy->user_policy.min, new_policy.max); + + cpufreq_set_policy(policy, &new_policy); + + cpufreq_cpu_release(policy); +} + static void intel_pstate_update_limits(unsigned int cpu) { mutex_lock(&intel_pstate_driver_lock); @@ -908,9 +930,10 @@ static void intel_pstate_update_limits(unsigned int cpu) * If turbo has been turned on or off globally, policy limits for * all CPUs need to be updated to reflect that. */ - if (global.turbo_disabled_s != global.turbo_disabled) { - global.turbo_disabled_s = global.turbo_disabled; - intel_pstate_update_policies(); + if (global.turbo_disabled_mf != global.turbo_disabled) { + global.turbo_disabled_mf = global.turbo_disabled; + for_each_possible_cpu(cpu) + intel_pstate_update_max_freq(cpu); } else { cpufreq_update_policy(cpu); } @@ -2159,7 +2182,7 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; update_turbo_state(); - global.turbo_disabled_s = global.turbo_disabled; + global.turbo_disabled_mf = global.turbo_disabled; policy->cpuinfo.max_freq = global.turbo_disabled ? 
cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; policy->cpuinfo.max_freq *= cpu->pstate.scaling; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5005ea40364f..684caf067003 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -178,6 +178,11 @@ static inline struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { } #endif +static inline bool policy_is_inactive(struct cpufreq_policy *policy) +{ + return cpumask_empty(policy->cpus); +} + static inline bool policy_is_shared(struct cpufreq_policy *policy) { return cpumask_weight(policy->cpus) > 1; @@ -193,7 +198,12 @@ unsigned int cpufreq_quick_get_max(unsigned int cpu); void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); + +struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu); +void cpufreq_cpu_release(struct cpufreq_policy *policy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); +int cpufreq_set_policy(struct cpufreq_policy *policy, + struct cpufreq_policy *new_policy); void cpufreq_update_policy(unsigned int cpu); void cpufreq_update_limits(unsigned int cpu); bool have_governor_per_policy(void); -- cgit v1.2.3 From 108ec36b699475001f5af81ff7db624427d14dbe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 30 Mar 2019 12:20:22 +0100 Subject: drivers/cpufreq: Convert some slow-path static_cpu_has() callers to boot_cpu_has() Using static_cpu_has() is pointless on those paths, convert them to the boot_cpu_has() variant. No functional changes. Signed-off-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/amd_freq_sensitivity.c | 2 +- drivers/cpufreq/intel_pstate.c | 18 +++++++++--------- drivers/cpufreq/powernow-k8.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c index 4ac7c3cf34be..6927a8c0e748 100644 --- a/drivers/cpufreq/amd_freq_sensitivity.c +++ b/drivers/cpufreq/amd_freq_sensitivity.c @@ -124,7 +124,7 @@ static int __init amd_freq_sensitivity_init(void) PCI_DEVICE_ID_AMD_KERNCZ_SMBUS, NULL); if (!pcidev) { - if (!static_cpu_has(X86_FEATURE_PROC_FEEDBACK)) + if (!boot_cpu_has(X86_FEATURE_PROC_FEEDBACK)) return -ENODEV; } diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 08d1a1e845aa..840500b457c6 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -527,7 +527,7 @@ static s16 intel_pstate_get_epb(struct cpudata *cpu_data) u64 epb; int ret; - if (!static_cpu_has(X86_FEATURE_EPB)) + if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENXIO; ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); @@ -541,7 +541,7 @@ static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data) { s16 epp; - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { /* * When hwp_req_data is 0, means that caller didn't read * MSR_HWP_REQUEST, so need to read and get EPP. 
@@ -566,7 +566,7 @@ static int intel_pstate_set_epb(int cpu, s16 pref) u64 epb; int ret; - if (!static_cpu_has(X86_FEATURE_EPB)) + if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENXIO; ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); @@ -614,7 +614,7 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) if (epp < 0) return epp; - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { if (epp == HWP_EPP_PERFORMANCE) return 1; if (epp <= HWP_EPP_BALANCE_PERFORMANCE) @@ -623,7 +623,7 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) return 3; else return 4; - } else if (static_cpu_has(X86_FEATURE_EPB)) { + } else if (boot_cpu_has(X86_FEATURE_EPB)) { /* * Range: * 0x00-0x03 : Performance @@ -651,7 +651,7 @@ static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, mutex_lock(&intel_pstate_limits_lock); - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { u64 value; ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value); @@ -826,7 +826,7 @@ static void intel_pstate_hwp_set(unsigned int cpu) epp = cpu_data->epp_powersave; } update_epp: - if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { value &= ~GENMASK_ULL(31, 24); value |= (u64)epp << 24; } else { @@ -851,7 +851,7 @@ static void intel_pstate_hwp_force_min_perf(int cpu) value |= HWP_MIN_PERF(min_perf); /* Set EPP/EPB to min */ - if (static_cpu_has(X86_FEATURE_HWP_EPP)) + if (boot_cpu_has(X86_FEATURE_HWP_EPP)) value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE); else intel_pstate_set_epb(cpu, HWP_EPP_BALANCE_POWERSAVE); @@ -1241,7 +1241,7 @@ static void __init intel_pstate_sysfs_expose_params(void) static void intel_pstate_hwp_enable(struct cpudata *cpudata) { /* First disable HWP notification interrupt as we don't process them */ - if (static_cpu_has(X86_FEATURE_HWP_NOTIFY)) + if (boot_cpu_has(X86_FEATURE_HWP_NOTIFY)) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index fb77b39a4ce3..3c12e03fa343 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -1178,7 +1178,7 @@ static int powernowk8_init(void) unsigned int i, supported_cpus = 0; int ret; - if (static_cpu_has(X86_FEATURE_HW_PSTATE)) { + if (boot_cpu_has(X86_FEATURE_HW_PSTATE)) { __request_acpi_cpufreq(); return -ENODEV; } -- cgit v1.2.3 From b623fa320f8360f049a6f3c3ccc487cb85af4c5b Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 1 Apr 2019 09:37:48 +0800 Subject: cpufreq: ap806: fix possible object reference leak The call to of_find_compatible_node returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/cpufreq/armada-8k-cpufreq.c:187:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 130, but without a corresponding object release within this function. ./drivers/cpufreq/armada-8k-cpufreq.c:191:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 130, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: Jason Cooper Cc: Andrew Lunn Cc: Gregory Clement Cc: Sebastian Hesselbarth Cc: "Rafael J. 
Wysocki" Cc: Viresh Kumar Cc: linux-arm-kernel@lists.infradead.org Cc: linux-pm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Viresh Kumar --- drivers/cpufreq/armada-8k-cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c index b3f4bd647e9b..988ebc326bdb 100644 --- a/drivers/cpufreq/armada-8k-cpufreq.c +++ b/drivers/cpufreq/armada-8k-cpufreq.c @@ -132,6 +132,7 @@ static int __init armada_8k_cpufreq_init(void) of_node_put(node); return -ENODEV; } + of_node_put(node); nb_cpus = num_possible_cpus(); freq_tables = kcalloc(nb_cpus, sizeof(*freq_tables), GFP_KERNEL); -- cgit v1.2.3 From ddb64c5db3cc8fb9c1242214d5798b2c2865681c Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 1 Apr 2019 09:37:49 +0800 Subject: cpufreq: imx6q: fix possible object reference leak The call to of_node_get returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/cpufreq/imx6q-cpufreq.c:391:4-10: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 348, but without a corresponding object release within this function. ./drivers/cpufreq/imx6q-cpufreq.c:395:3-9: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 348, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: Shawn Guo Cc: Sascha Hauer Cc: Pengutronix Kernel Team Cc: Fabio Estevam Cc: NXP Linux Team Cc: linux-pm@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Viresh Kumar --- drivers/cpufreq/imx6q-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index a4ff09f91c8f..3e17560b1efe 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -388,11 +388,11 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) ret = imx6ul_opp_check_speed_grading(cpu_dev); if (ret) { if (ret == -EPROBE_DEFER) - return ret; + goto put_node; dev_err(cpu_dev, "failed to read ocotp: %d\n", ret); - return ret; + goto put_node; } } else { imx6q_opp_check_speed_grading(cpu_dev); -- cgit v1.2.3 From 7c468966f05ac9c17bb5948275283d34e6fe0660 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 1 Apr 2019 09:37:50 +0800 Subject: cpufreq: kirkwood: fix possible object reference leak The call to of_get_child_by_name returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/cpufreq/kirkwood-cpufreq.c:127:2-8: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 118, but without a corresponding object release within this function. ./drivers/cpufreq/kirkwood-cpufreq.c:133:2-8: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 118, but without a corresponding object release within this function. and also do some cleanup: - of_node_put(np); - np = NULL; ... of_node_put(np); Signed-off-by: Wen Yang Cc: "Rafael J. 
Wysocki" Cc: Viresh Kumar Cc: linux-pm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Viresh Kumar --- drivers/cpufreq/kirkwood-cpufreq.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c index c2dd43f3f5d8..8d63a6dc8383 100644 --- a/drivers/cpufreq/kirkwood-cpufreq.c +++ b/drivers/cpufreq/kirkwood-cpufreq.c @@ -124,13 +124,14 @@ static int kirkwood_cpufreq_probe(struct platform_device *pdev) priv.cpu_clk = of_clk_get_by_name(np, "cpu_clk"); if (IS_ERR(priv.cpu_clk)) { dev_err(priv.dev, "Unable to get cpuclk\n"); - return PTR_ERR(priv.cpu_clk); + err = PTR_ERR(priv.cpu_clk); + goto out_node; } err = clk_prepare_enable(priv.cpu_clk); if (err) { dev_err(priv.dev, "Unable to prepare cpuclk\n"); - return err; + goto out_node; } kirkwood_freq_table[0].frequency = clk_get_rate(priv.cpu_clk) / 1000; @@ -161,20 +162,22 @@ static int kirkwood_cpufreq_probe(struct platform_device *pdev) goto out_ddr; } - of_node_put(np); - np = NULL; - err = cpufreq_register_driver(&kirkwood_cpufreq_driver); - if (!err) - return 0; + if (err) { + dev_err(priv.dev, "Failed to register cpufreq driver\n"); + goto out_powersave; + } - dev_err(priv.dev, "Failed to register cpufreq driver\n"); + of_node_put(np); + return 0; +out_powersave: clk_disable_unprepare(priv.powersave_clk); out_ddr: clk_disable_unprepare(priv.ddr_clk); out_cpu: clk_disable_unprepare(priv.cpu_clk); +out_node: of_node_put(np); return err; -- cgit v1.2.3 From ddb07fba1c645791ead16d1eee0639a033fb8cf9 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 1 Apr 2019 09:37:51 +0800 Subject: cpufreq: maple: fix possible object reference leak The call to of_cpu_device_node_get returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/cpufreq/maple-cpufreq.c:213:2-8: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 177, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: linux-pm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Viresh Kumar --- drivers/cpufreq/maple-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c index d9df89392b84..a05f1342ec02 100644 --- a/drivers/cpufreq/maple-cpufreq.c +++ b/drivers/cpufreq/maple-cpufreq.c @@ -210,7 +210,7 @@ static int __init maple_cpufreq_init(void) */ valp = of_get_property(cpunode, "clock-frequency", NULL); if (!valp) - return -ENODEV; + goto bail_noprops; max_freq = (*valp)/1000; maple_cpu_freqs[0].frequency = max_freq; maple_cpu_freqs[1].frequency = max_freq/2; -- cgit v1.2.3 From a9acc26b75f652f697e02a9febe2ab0da648a571 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 1 Apr 2019 09:37:52 +0800 Subject: cpufreq/pasemi: fix possible object reference leak The call to of_get_cpu_node returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./drivers/cpufreq/pasemi-cpufreq.c:212:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 147, but without a corresponding object release within this function. 
./drivers/cpufreq/pasemi-cpufreq.c:220:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 147, but without a corresponding object release within this function. Signed-off-by: Wen Yang Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-pm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Viresh Kumar --- drivers/cpufreq/pasemi-cpufreq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/driver
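The reference-leak fixes above all enforce the same rule: a device_node returned by an of_* lookup comes back with its refcount raised and must be dropped on every exit path, including error paths. A minimal sketch of the pattern (the compatible string, property and function name are placeholders, not taken from any of the drivers above):

#include <linux/kernel.h>
#include <linux/of.h>

static int example_read_cpu_clock(u32 *freq_khz)
{
	struct device_node *np;
	const __be32 *valp;
	int ret = 0;

	np = of_find_compatible_node(NULL, NULL, "vendor,example-cpufreq");
	if (!np)
		return -ENODEV;	/* nothing acquired, nothing to put */

	valp = of_get_property(np, "clock-frequency", NULL);
	if (!valp) {
		ret = -ENODEV;
		goto out_put;	/* error path still drops the reference */
	}
	*freq_khz = be32_to_cpup(valp) / 1000;

out_put:
	of_node_put(np);	/* balances of_find_compatible_node() */
	return ret;
}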