From 521b512b157a1315ff2bf11c11ab184c79515aea Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:47 +0100 Subject: PM / EM: change naming convention from 'capacity' to 'performance' The Energy Model uses concept of performance domain and capacity states in order to calculate power used by CPUs. Change naming convention from capacity to performance state would enable wider usage in future, e.g. upcoming support for other devices other than CPUs. Acked-by: Daniel Lezcano Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/thermal/cpufreq_cooling.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 9e124020519f..641995ebc107 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -333,18 +333,18 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev, return false; policy = cpufreq_cdev->policy; - if (!cpumask_equal(policy->related_cpus, to_cpumask(em->cpus))) { + if (!cpumask_equal(policy->related_cpus, em_span_cpus(em))) { pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n", - cpumask_pr_args(to_cpumask(em->cpus)), + cpumask_pr_args(em_span_cpus(em)), cpumask_pr_args(policy->related_cpus)); return false; } nr_levels = cpufreq_cdev->max_level + 1; - if (em->nr_cap_states != nr_levels) { - pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n", - cpumask_pr_args(to_cpumask(em->cpus)), - em->nr_cap_states, nr_levels); + if (em_pd_nr_perf_states(em) != nr_levels) { + pr_err("The number of performance states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n", + cpumask_pr_args(em_span_cpus(em)), + em_pd_nr_perf_states(em), nr_levels); return false; } -- cgit v1.2.3 From d0351cc3b0f57214d157e4d589564730af2aedae Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:49 +0100 Subject: PM / EM: update callback structure and add device pointer The Energy Model framework is going to support devices other that CPUs. In order to make this happen change the callback function and add pointer to a device as an argument. Update the related users to use new function and new callback from the Energy Model. Acked-by: Quentin Perret Signed-off-by: Lukasz Luba Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/scmi-cpufreq.c | 11 +++-------- drivers/opp/of.c | 9 ++------- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 61623e2ff149..11ee24e06d12 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -103,17 +103,12 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) } static int __maybe_unused -scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, int cpu) +scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, + struct device *cpu_dev) { - struct device *cpu_dev = get_cpu_device(cpu); unsigned long Hz; int ret, domain; - if (!cpu_dev) { - pr_err("failed to get cpu%d device\n", cpu); - return -ENODEV; - } - domain = handle->perf_ops->device_domain_id(cpu_dev); if (domain < 0) return domain; @@ -200,7 +195,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = true; - em_register_perf_domain(policy->cpus, nr_opp, &em_cb); + em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); return 0; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 9a5873591a40..e273f419a4bf 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1216,9 +1216,8 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); * calculation failed because of missing parameters, 0 otherwise. */ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, - int cpu) + struct device *cpu_dev) { - struct device *cpu_dev; struct dev_pm_opp *opp; struct device_node *np; unsigned long mV, Hz; @@ -1226,10 +1225,6 @@ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, u64 tmp; int ret; - cpu_dev = get_cpu_device(cpu); - if (!cpu_dev) - return -ENODEV; - np = of_node_get(cpu_dev->of_node); if (!np) return -EINVAL; @@ -1297,6 +1292,6 @@ void dev_pm_opp_of_register_em(struct cpumask *cpus) if (ret || !cap) return; - em_register_perf_domain(cpus, nr_opp, &em_cb); + em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, cpus); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_register_em); -- cgit v1.2.3 From 0e0ffa855d1590e54ec0033404a49e2e57e294fe Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 27 May 2020 10:58:54 +0100 Subject: OPP: refactor dev_pm_opp_of_register_em() and update related drivers The Energy Model framework supports not only CPU devices. Drop the CPU specific interface with cpumask and add struct device. Add also a return value, user might use it. This new interface provides easy way to create a simple Energy Model, which then might be used by e.g. thermal subsystem. Acked-by: Daniel Lezcano Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 2 +- drivers/cpufreq/imx6q-cpufreq.c | 2 +- drivers/cpufreq/mediatek-cpufreq.c | 2 +- drivers/cpufreq/omap-cpufreq.c | 2 +- drivers/cpufreq/qcom-cpufreq-hw.c | 2 +- drivers/cpufreq/scpi-cpufreq.c | 2 +- drivers/cpufreq/vexpress-spc-cpufreq.c | 2 +- drivers/opp/of.c | 71 +++++++++++++++++++++------------- 8 files changed, 52 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 79742bbd221f..944d7b45afe9 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -279,7 +279,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; policy->dvfs_possible_from_any_cpu = true; - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(cpu_dev, policy->cpus); return 0; diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index fdb2ffffbd15..ef7b34c1fd2b 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -193,7 +193,7 @@ static int imx6q_cpufreq_init(struct cpufreq_policy *policy) policy->clk = clks[ARM].clk; cpufreq_generic_init(policy, freq_table, transition_latency); policy->suspend_freq = max_freq; - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(cpu_dev, policy->cpus); return 0; } diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 0c98dd08273d..7d1212c9b7c8 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -448,7 +448,7 @@ static int mtk_cpufreq_init(struct cpufreq_policy *policy) policy->driver_data = info; policy->clk = info->cpu_clk; - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(info->cpu_dev, policy->cpus); return 0; } diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 8d14b42a8c6f..3694bb030df3 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -131,7 +131,7 @@ static int omap_cpu_init(struct cpufreq_policy *policy) /* FIXME: what's the actual transition time? */ cpufreq_generic_init(policy, freq_table, 300 * 1000); - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(mpu_dev, policy->cpus); return 0; } diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index fc92a8842e25..0a04b6f03b9a 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -238,7 +238,7 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) goto error; } - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(cpu_dev, policy->cpus); policy->fast_switch_possible = true; diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index 20d1f85d5f5a..b0f5388b8854 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -167,7 +167,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = false; - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(cpu_dev, policy->cpus); return 0; diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c index 83c85d3d67e3..4e8b1dee7c9a 100644 --- a/drivers/cpufreq/vexpress-spc-cpufreq.c +++ b/drivers/cpufreq/vexpress-spc-cpufreq.c @@ -450,7 +450,7 @@ static int ve_spc_cpufreq_init(struct cpufreq_policy *policy) policy->freq_table = freq_table[cur_cluster]; policy->cpuinfo.transition_latency = 1000000; /* 1 ms */ - dev_pm_opp_of_register_em(policy->cpus); + dev_pm_opp_of_register_em(cpu_dev, policy->cpus); if (is_bL_switching_enabled()) per_cpu(cpu_last_req_freq, policy->cpu) = diff --git a/drivers/opp/of.c b/drivers/opp/of.c index e273f419a4bf..4aa42739599e 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1205,18 +1205,18 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); /* * Callback function provided to the Energy Model framework upon registration. - * This computes the power estimated by @CPU at @kHz if it is the frequency + * This computes the power estimated by @dev at @kHz if it is the frequency * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled * frequency and @mW to the associated power. The power is estimated as - * P = C * V^2 * f with C being the CPU's capacitance and V and f respectively - * the voltage and frequency of the OPP. + * P = C * V^2 * f with C being the device's capacitance and V and f + * respectively the voltage and frequency of the OPP. * - * Returns -ENODEV if the CPU device cannot be found, -EINVAL if the power - * calculation failed because of missing parameters, 0 otherwise. + * Returns -EINVAL if the power calculation failed because of missing + * parameters, 0 otherwise. */ -static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, - struct device *cpu_dev) +static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz, + struct device *dev) { struct dev_pm_opp *opp; struct device_node *np; @@ -1225,7 +1225,7 @@ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, u64 tmp; int ret; - np = of_node_get(cpu_dev->of_node); + np = of_node_get(dev->of_node); if (!np) return -EINVAL; @@ -1235,7 +1235,7 @@ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, return -EINVAL; Hz = *kHz * 1000; - opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz); + opp = dev_pm_opp_find_freq_ceil(dev, &Hz); if (IS_ERR(opp)) return -EINVAL; @@ -1255,30 +1255,38 @@ static int __maybe_unused _get_cpu_power(unsigned long *mW, unsigned long *kHz, /** * dev_pm_opp_of_register_em() - Attempt to register an Energy Model - * @cpus : CPUs for which an Energy Model has to be registered + * @dev : Device for which an Energy Model has to be registered + * @cpus : CPUs for which an Energy Model has to be registered. For + * other type of devices it should be set to NULL. * * This checks whether the "dynamic-power-coefficient" devicetree property has * been specified, and tries to register an Energy Model with it if it has. + * Having this property means the voltages are known for OPPs and the EM + * might be calculated. */ -void dev_pm_opp_of_register_em(struct cpumask *cpus) +int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) { - struct em_data_callback em_cb = EM_DATA_CB(_get_cpu_power); - int ret, nr_opp, cpu = cpumask_first(cpus); - struct device *cpu_dev; + struct em_data_callback em_cb = EM_DATA_CB(_get_power); struct device_node *np; + int ret, nr_opp; u32 cap; - cpu_dev = get_cpu_device(cpu); - if (!cpu_dev) - return; + if (IS_ERR_OR_NULL(dev)) { + ret = -EINVAL; + goto failed; + } - nr_opp = dev_pm_opp_get_opp_count(cpu_dev); - if (nr_opp <= 0) - return; + nr_opp = dev_pm_opp_get_opp_count(dev); + if (nr_opp <= 0) { + ret = -EINVAL; + goto failed; + } - np = of_node_get(cpu_dev->of_node); - if (!np) - return; + np = of_node_get(dev->of_node); + if (!np) { + ret = -EINVAL; + goto failed; + } /* * Register an EM only if the 'dynamic-power-coefficient' property is @@ -1289,9 +1297,20 @@ void dev_pm_opp_of_register_em(struct cpumask *cpus) */ ret = of_property_read_u32(np, "dynamic-power-coefficient", &cap); of_node_put(np); - if (ret || !cap) - return; + if (ret || !cap) { + dev_dbg(dev, "Couldn't find proper 'dynamic-power-coefficient' in DT\n"); + ret = -EINVAL; + goto failed; + } - em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, cpus); + ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus); + if (ret) + goto failed; + + return 0; + +failed: + dev_dbg(dev, "Couldn't register Energy Model %d\n", ret); + return ret; } EXPORT_SYMBOL_GPL(dev_pm_opp_of_register_em); -- cgit v1.2.3 From dab20177b626198603176604c0f85ea1d67044ef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jun 2020 13:58:28 +0200 Subject: intel_idle: Eliminate redundant static variable The value of the lapic_timer_always_reliable static variable in the intel_idle driver reflects the boot_cpu_has(X86_FEATURE_ARAT) value and so it also reflects the static_cpu_has(X86_FEATURE_ARAT) value. Hence, the lapic_timer_always_reliable check in intel_idle() is redundant and apart from this lapic_timer_always_reliable is only used in two places in which boot_cpu_has(X86_FEATURE_ARAT) can be used directly. Eliminate the lapic_timer_always_reliable variable in accordance with the above observations. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f4495841bf68..fa23a7ea01ac 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -66,8 +66,6 @@ static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static unsigned long auto_demotion_disable_flags; static bool disable_promotion_to_c1e; -static bool lapic_timer_always_reliable; - struct idle_cpu { struct cpuidle_state *state_table; @@ -142,7 +140,7 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) leave_mm(cpu); - if (!static_cpu_has(X86_FEATURE_ARAT) && !lapic_timer_always_reliable) { + if (!static_cpu_has(X86_FEATURE_ARAT)) { /* * Switch over to one-shot tick broadcast if the target C-state * is deeper than C1. @@ -1562,7 +1560,7 @@ static int intel_idle_cpu_online(unsigned int cpu) { struct cpuidle_device *dev; - if (!lapic_timer_always_reliable) + if (!boot_cpu_has(X86_FEATURE_ARAT)) tick_broadcast_enable(); /* @@ -1655,16 +1653,13 @@ static int __init intel_idle_init(void) goto init_driver_fail; } - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ - lapic_timer_always_reliable = true; - retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", intel_idle_cpu_online, NULL); if (retval < 0) goto hp_setup_fail; pr_debug("Local APIC timer is reliable in %s\n", - lapic_timer_always_reliable ? "all C-states" : "C1"); + boot_cpu_has(X86_FEATURE_ARAT) ? "all C-states" : "C1"); return 0; -- cgit v1.2.3 From 2d798d9f5967a3f134b1c55acbbe026c032e3429 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 29 Jun 2020 13:34:50 +0800 Subject: powercap: intel_rapl: add support for Sapphire Rapids RAPL on SPR behaves similar to Haswell server, except that SPR uses a fixed energy unit (1 Joule) for the PSYS RAPL domain. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 61a63a16b5e7..b739ce4f390d 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -93,6 +93,7 @@ struct rapl_defaults { u64 (*compute_time_window)(struct rapl_package *rp, u64 val, bool to_raw); unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; }; static struct rapl_defaults *rapl_defaults; @@ -533,12 +534,23 @@ static void rapl_init_domains(struct rapl_package *rp) for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++) rd->regs[j] = rp->priv->regs[i][j]; - if (i == RAPL_DOMAIN_DRAM) { + switch (i) { + case RAPL_DOMAIN_DRAM: rd->domain_energy_unit = rapl_defaults->dram_domain_energy_unit; if (rd->domain_energy_unit) pr_info("DRAM domain energy unit %dpj\n", rd->domain_energy_unit); + break; + case RAPL_DOMAIN_PLATFORM: + rd->domain_energy_unit = + rapl_defaults->psys_domain_energy_unit; + if (rd->domain_energy_unit) + pr_info("Platform domain energy unit %dpj\n", + rd->domain_energy_unit); + break; + default: + break; } rd++; } @@ -919,6 +931,14 @@ static const struct rapl_defaults rapl_defaults_hsw_server = { .dram_domain_energy_unit = 15300, }; +static const struct rapl_defaults rapl_defaults_spr_server = { + .check_unit = rapl_check_unit_core, + .set_floor_freq = set_floor_freq_default, + .compute_time_window = rapl_compute_time_window_core, + .dram_domain_energy_unit = 15300, + .psys_domain_energy_unit = 1000000000, +}; + static const struct rapl_defaults rapl_defaults_byt = { .floor_freq_reg_addr = IOSF_CPU_POWER_BUDGET_CTL_BYT, .check_unit = rapl_check_unit_atom, @@ -978,6 +998,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &rapl_defaults_byt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &rapl_defaults_cht), -- cgit v1.2.3 From ed7bde7a6dab521ee9e28fe2264018f83d83c7fa Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 26 Jun 2020 11:34:00 -0700 Subject: cpufreq: intel_pstate: Allow enable/disable energy efficiency By default intel_pstate the driver disables energy efficiency by setting MSR_IA32_POWER_CTL bit 19 for Kaby Lake desktop CPU model in HWP mode. This CPU model is also shared by Coffee Lake desktop CPUs. This allows these systems to reach maximum possible frequency. But this adds power penalty, which some customers don't want. They want some way to enable/ disable dynamically. So, add an additional attribute "energy_efficiency" under /sys/devices/system/cpu/intel_pstate/ for these CPU models. This allows to read and write bit 19 ("Disable Energy Efficiency Optimization") in the MSR IA32_POWER_CTL. This attribute is present in both HWP and non-HWP mode as this has an effect in both modes. Refer to Intel Software Developer's manual for details. The scope of this bit is package wide. Also these systems are single package systems. So read/write MSR on the current CPU is enough. The energy efficiency (EE) bit setting needs to be preserved during suspend/resume and CPU offline/online operation. To do this: - Restoring the EE setting from the cpufreq resume() callback, if there is change from the system default. - By default, don't disable EE from cpufreq init() callback for matching CPU models. Since the scope is package wide and is a single package system, move the disable EE calls from init() callback to intel_pstate_init() function, which is called only once. Suggested-by: Len Brown Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 97 +++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e771e8b4f99f..fd1bef0b9bb4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -866,10 +866,39 @@ static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy) return 0; } +#define POWER_CTL_EE_ENABLE 1 +#define POWER_CTL_EE_DISABLE 2 + +static int power_ctl_ee_state; + +static void set_power_ctl_ee_state(bool input) +{ + u64 power_ctl; + + mutex_lock(&intel_pstate_driver_lock); + rdmsrl(MSR_IA32_POWER_CTL, power_ctl); + if (input) { + power_ctl &= ~BIT(MSR_IA32_POWER_CTL_BIT_EE); + power_ctl_ee_state = POWER_CTL_EE_ENABLE; + } else { + power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); + power_ctl_ee_state = POWER_CTL_EE_DISABLE; + } + wrmsrl(MSR_IA32_POWER_CTL, power_ctl); + mutex_unlock(&intel_pstate_driver_lock); +} + static void intel_pstate_hwp_enable(struct cpudata *cpudata); static int intel_pstate_resume(struct cpufreq_policy *policy) { + + /* Only restore if the system default is changed */ + if (power_ctl_ee_state == POWER_CTL_EE_ENABLE) + set_power_ctl_ee_state(true); + else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE) + set_power_ctl_ee_state(false); + if (!hwp_active) return 0; @@ -1218,6 +1247,32 @@ static ssize_t store_hwp_dynamic_boost(struct kobject *a, return count; } +static ssize_t show_energy_efficiency(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + u64 power_ctl; + int enable; + + rdmsrl(MSR_IA32_POWER_CTL, power_ctl); + enable = !!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE)); + return sprintf(buf, "%d\n", !enable); +} + +static ssize_t store_energy_efficiency(struct kobject *a, struct kobj_attribute *b, + const char *buf, size_t count) +{ + bool input; + int ret; + + ret = kstrtobool(buf, &input); + if (ret) + return ret; + + set_power_ctl_ee_state(input); + + return count; +} + show_one(max_perf_pct, max_perf_pct); show_one(min_perf_pct, min_perf_pct); @@ -1228,6 +1283,7 @@ define_one_global_rw(min_perf_pct); define_one_global_ro(turbo_pct); define_one_global_ro(num_pstates); define_one_global_rw(hwp_dynamic_boost); +define_one_global_rw(energy_efficiency); static struct attribute *intel_pstate_attributes[] = { &status.attr, @@ -1241,6 +1297,8 @@ static const struct attribute_group intel_pstate_attr_group = { .attrs = intel_pstate_attributes, }; +static const struct x86_cpu_id intel_pstate_cpu_ee_disable_ids[]; + static void __init intel_pstate_sysfs_expose_params(void) { struct kobject *intel_pstate_kobject; @@ -1273,6 +1331,11 @@ static void __init intel_pstate_sysfs_expose_params(void) &hwp_dynamic_boost.attr); WARN_ON(rc); } + + if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) { + rc = sysfs_create_file(intel_pstate_kobject, &energy_efficiency.attr); + WARN_ON(rc); + } } /************************** sysfs end ************************/ @@ -1288,25 +1351,6 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); } -#define MSR_IA32_POWER_CTL_BIT_EE 19 - -/* Disable energy efficiency optimization */ -static void intel_pstate_disable_ee(int cpu) -{ - u64 power_ctl; - int ret; - - ret = rdmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, &power_ctl); - if (ret) - return; - - if (!(power_ctl & BIT(MSR_IA32_POWER_CTL_BIT_EE))) { - pr_info("Disabling energy efficiency optimization\n"); - power_ctl |= BIT(MSR_IA32_POWER_CTL_BIT_EE); - wrmsrl_on_cpu(cpu, MSR_IA32_POWER_CTL, power_ctl); - } -} - static int atom_get_min_pstate(void) { u64 value; @@ -1982,10 +2026,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum) if (hwp_active) { const struct x86_cpu_id *id; - id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); - if (id) - intel_pstate_disable_ee(cpunum); - intel_pstate_hwp_enable(cpu); id = x86_match_cpu(intel_pstate_hwp_boost_ids); @@ -2806,8 +2846,17 @@ hwp_cpu_matched: if (rc) return rc; - if (hwp_active) + if (hwp_active) { + const struct x86_cpu_id *id; + + id = x86_match_cpu(intel_pstate_cpu_ee_disable_ids); + if (id) { + set_power_ctl_ee_state(false); + pr_info("Disabling energy efficiency optimization\n"); + } + pr_info("HWP enabled\n"); + } return 0; } -- cgit v1.2.3 From f473bf398bf1ed42b1bdbc5206a804d8d3140a2d Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 26 Jun 2020 11:34:01 -0700 Subject: cpufreq: intel_pstate: Allow raw energy performance preference value Currently using attribute "energy_performance_preference", user space can write one of the four per-defined preference string. These preference strings gets mapped to a hard-coded Energy-Performance Preference (EPP) or Energy-Performance Bias (EPB) knob. These four values are supposed to cover broad spectrum of use cases, but are not uniformly distributed in the range. There are number of cases, where this is not enough. For example: Suppose user wants more performance when connected to AC. Instead of using default "balance performance", the "performance" setting can be used. This changes EPP value from 0x80 to 0x00. But setting EPP to 0, results in electrical and thermal issues on some platforms. This results in aggressive throttling, which causes a drop in performance. But some value between 0x80 and 0x00 results in better performance. But that value can't be fixed as the power curve is not linear. In some cases just changing EPP from 0x80 to 0x75 is enough to get significant performance gain. Similarly on battery the default "balance_performance" mode can be aggressive in power consumption. But picking up the next choice "balance power" results in too much loss of performance, which results in bad user experience in use cases like "Google Hangout". It was observed that some value between these two EPP is optimal. This change allows fine grain EPP tuning for platform like Chromebook or for users who wants to fine tune power and performance. Here based on the product and use cases, different EPP values can be set. This change is similar to the change done for: /sys/devices/system/cpu/cpu*/power/energy_perf_bias where user has choice to write a predefined string or raw value. The change itself is trivial. When user preference doesn't match predefined string preferences and value is an unsigned integer and in range, use that value for EPP. When the EPP feature is not present writing raw value is not supported. Suggested-by: Len Brown Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 50 +++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index fd1bef0b9bb4..44c7b4677675 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -602,11 +602,12 @@ static const unsigned int epp_values[] = { HWP_EPP_POWERSAVE }; -static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) +static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw_epp) { s16 epp; int index = -EINVAL; + *raw_epp = 0; epp = intel_pstate_get_epp(cpu_data, 0); if (epp < 0) return epp; @@ -614,12 +615,14 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) if (boot_cpu_has(X86_FEATURE_HWP_EPP)) { if (epp == HWP_EPP_PERFORMANCE) return 1; - if (epp <= HWP_EPP_BALANCE_PERFORMANCE) + if (epp == HWP_EPP_BALANCE_PERFORMANCE) return 2; - if (epp <= HWP_EPP_BALANCE_POWERSAVE) + if (epp == HWP_EPP_BALANCE_POWERSAVE) return 3; - else + if (epp == HWP_EPP_POWERSAVE) return 4; + *raw_epp = epp; + return 0; } else if (boot_cpu_has(X86_FEATURE_EPB)) { /* * Range: @@ -638,7 +641,8 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) } static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, - int pref_index) + int pref_index, bool use_raw, + u32 raw_epp) { int epp = -EINVAL; int ret; @@ -657,6 +661,16 @@ static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, value &= ~GENMASK_ULL(31, 24); + if (use_raw) { + if (raw_epp > 255) { + ret = -EINVAL; + goto return_pref; + } + value |= (u64)raw_epp << 24; + ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value); + goto return_pref; + } + if (epp == -EINVAL) epp = epp_values[pref_index - 1]; @@ -694,6 +708,8 @@ static ssize_t store_energy_performance_preference( { struct cpudata *cpu_data = all_cpu_data[policy->cpu]; char str_preference[21]; + bool raw = false; + u32 epp; int ret; ret = sscanf(buf, "%20s", str_preference); @@ -701,10 +717,21 @@ static ssize_t store_energy_performance_preference( return -EINVAL; ret = match_string(energy_perf_strings, -1, str_preference); - if (ret < 0) + if (ret < 0) { + if (!boot_cpu_has(X86_FEATURE_HWP_EPP)) + return ret; + + ret = kstrtouint(buf, 10, &epp); + if (ret) + return ret; + + raw = true; + } + + ret = intel_pstate_set_energy_pref_index(cpu_data, ret, raw, epp); + if (ret) return ret; - intel_pstate_set_energy_pref_index(cpu_data, ret); return count; } @@ -712,13 +739,16 @@ static ssize_t show_energy_performance_preference( struct cpufreq_policy *policy, char *buf) { struct cpudata *cpu_data = all_cpu_data[policy->cpu]; - int preference; + int preference, raw_epp; - preference = intel_pstate_get_energy_pref_index(cpu_data); + preference = intel_pstate_get_energy_pref_index(cpu_data, &raw_epp); if (preference < 0) return preference; - return sprintf(buf, "%s\n", energy_perf_strings[preference]); + if (raw_epp) + return sprintf(buf, "%d\n", raw_epp); + else + return sprintf(buf, "%s\n", energy_perf_strings[preference]); } cpufreq_freq_attr_rw(energy_performance_preference); -- cgit v1.2.3 From 8cc46ae565c393f77417cb9530b1265eb50f5d2e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 29 Jun 2020 13:54:58 +0530 Subject: cpufreq: Fix locking issues with governors The locking around governors handling isn't adequate currently. The list of governors should never be traversed without the locking in place. Also governor modules must not be removed while the code in them is still in use. Reported-by: Quentin Perret Signed-off-by: Viresh Kumar Cc: All applicable [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 58 ++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 0128de3603df..e9e8200a0211 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -621,6 +621,24 @@ static struct cpufreq_governor *find_governor(const char *str_governor) return NULL; } +static struct cpufreq_governor *get_governor(const char *str_governor) +{ + struct cpufreq_governor *t; + + mutex_lock(&cpufreq_governor_mutex); + t = find_governor(str_governor); + if (!t) + goto unlock; + + if (!try_module_get(t->owner)) + t = NULL; + +unlock: + mutex_unlock(&cpufreq_governor_mutex); + + return t; +} + static unsigned int cpufreq_parse_policy(char *str_governor) { if (!strncasecmp(str_governor, "performance", CPUFREQ_NAME_LEN)) @@ -640,28 +658,14 @@ static struct cpufreq_governor *cpufreq_parse_governor(char *str_governor) { struct cpufreq_governor *t; - mutex_lock(&cpufreq_governor_mutex); - - t = find_governor(str_governor); - if (!t) { - int ret; - - mutex_unlock(&cpufreq_governor_mutex); - - ret = request_module("cpufreq_%s", str_governor); - if (ret) - return NULL; - - mutex_lock(&cpufreq_governor_mutex); + t = get_governor(str_governor); + if (t) + return t; - t = find_governor(str_governor); - } - if (t && !try_module_get(t->owner)) - t = NULL; - - mutex_unlock(&cpufreq_governor_mutex); + if (request_module("cpufreq_%s", str_governor)) + return NULL; - return t; + return get_governor(str_governor); } /** @@ -815,12 +819,14 @@ static ssize_t show_scaling_available_governors(struct cpufreq_policy *policy, goto out; } + mutex_lock(&cpufreq_governor_mutex); for_each_governor(t) { if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char)) - (CPUFREQ_NAME_LEN + 2))) - goto out; + break; i += scnprintf(&buf[i], CPUFREQ_NAME_PLEN, "%s ", t->name); } + mutex_unlock(&cpufreq_governor_mutex); out: i += sprintf(&buf[i], "\n"); return i; @@ -1058,15 +1064,17 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) struct cpufreq_governor *def_gov = cpufreq_default_governor(); struct cpufreq_governor *gov = NULL; unsigned int pol = CPUFREQ_POLICY_UNKNOWN; + int ret; if (has_target()) { /* Update policy governor to the one used before hotplug. */ - gov = find_governor(policy->last_governor); + gov = get_governor(policy->last_governor); if (gov) { pr_debug("Restoring governor %s for cpu %d\n", policy->governor->name, policy->cpu); } else if (def_gov) { gov = def_gov; + __module_get(gov->owner); } else { return -ENODATA; } @@ -1089,7 +1097,11 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) return -ENODATA; } - return cpufreq_set_policy(policy, gov, pol); + ret = cpufreq_set_policy(policy, gov, pol); + if (gov) + module_put(gov->owner); + + return ret; } static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu) -- cgit v1.2.3 From 10dd8573b09e84b81539d939d55ebdb6a36c5f3a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 29 Jun 2020 13:54:59 +0530 Subject: cpufreq: Register governors at core_initcall Currently, most CPUFreq governors are registered at the core_initcall time when the given governor is the default one, and the module_init time otherwise. In preparation for letting users specify the default governor on the kernel command line, change all of them to be registered at the core_initcall unconditionally, as it is already the case for the schedutil and performance governors. This will allow us to assume that builtin governors have been registered before the built-in CPUFreq drivers probe. And since all governors have similar init/exit patterns now, introduce two new macros, cpufreq_governor_{init,exit}(), to factorize the code. Acked-by: Viresh Kumar Signed-off-by: Quentin Perret Signed-off-by: Viresh Kumar [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_conservative.c | 22 +++++----------------- drivers/cpufreq/cpufreq_ondemand.c | 24 ++++++------------------ drivers/cpufreq/cpufreq_performance.c | 14 ++------------ drivers/cpufreq/cpufreq_powersave.c | 18 +++--------------- drivers/cpufreq/cpufreq_userspace.c | 18 +++--------------- 5 files changed, 19 insertions(+), 77 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 737ff3b9c2c0..aa39ff31ec9f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -322,17 +322,7 @@ static struct dbs_governor cs_governor = { .start = cs_start, }; -#define CPU_FREQ_GOV_CONSERVATIVE (&cs_governor.gov) - -static int __init cpufreq_gov_dbs_init(void) -{ - return cpufreq_register_governor(CPU_FREQ_GOV_CONSERVATIVE); -} - -static void __exit cpufreq_gov_dbs_exit(void) -{ - cpufreq_unregister_governor(CPU_FREQ_GOV_CONSERVATIVE); -} +#define CPU_FREQ_GOV_CONSERVATIVE (cs_governor.gov) MODULE_AUTHOR("Alexander Clouter "); MODULE_DESCRIPTION("'cpufreq_conservative' - A dynamic cpufreq governor for " @@ -343,11 +333,9 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE struct cpufreq_governor *cpufreq_default_governor(void) { - return CPU_FREQ_GOV_CONSERVATIVE; + return &CPU_FREQ_GOV_CONSERVATIVE; } - -core_initcall(cpufreq_gov_dbs_init); -#else -module_init(cpufreq_gov_dbs_init); #endif -module_exit(cpufreq_gov_dbs_exit); + +cpufreq_governor_init(CPU_FREQ_GOV_CONSERVATIVE); +cpufreq_governor_exit(CPU_FREQ_GOV_CONSERVATIVE); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 82a4d37ddecb..ac361a8b1d3b 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -408,7 +408,7 @@ static struct dbs_governor od_dbs_gov = { .start = od_start, }; -#define CPU_FREQ_GOV_ONDEMAND (&od_dbs_gov.gov) +#define CPU_FREQ_GOV_ONDEMAND (od_dbs_gov.gov) static void od_set_powersave_bias(unsigned int powersave_bias) { @@ -429,7 +429,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias) continue; policy = cpufreq_cpu_get_raw(cpu); - if (!policy || policy->governor != CPU_FREQ_GOV_ONDEMAND) + if (!policy || policy->governor != &CPU_FREQ_GOV_ONDEMAND) continue; policy_dbs = policy->governor_data; @@ -461,16 +461,6 @@ void od_unregister_powersave_bias_handler(void) } EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler); -static int __init cpufreq_gov_dbs_init(void) -{ - return cpufreq_register_governor(CPU_FREQ_GOV_ONDEMAND); -} - -static void __exit cpufreq_gov_dbs_exit(void) -{ - cpufreq_unregister_governor(CPU_FREQ_GOV_ONDEMAND); -} - MODULE_AUTHOR("Venkatesh Pallipadi "); MODULE_AUTHOR("Alexey Starikovskiy "); MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for " @@ -480,11 +470,9 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND struct cpufreq_governor *cpufreq_default_governor(void) { - return CPU_FREQ_GOV_ONDEMAND; + return &CPU_FREQ_GOV_ONDEMAND; } - -core_initcall(cpufreq_gov_dbs_init); -#else -module_init(cpufreq_gov_dbs_init); #endif -module_exit(cpufreq_gov_dbs_exit); + +cpufreq_governor_init(CPU_FREQ_GOV_ONDEMAND); +cpufreq_governor_exit(CPU_FREQ_GOV_ONDEMAND); diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index def9afe0f5b8..71c1d9aba772 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_performance = { .limits = cpufreq_gov_performance_limits, }; -static int __init cpufreq_gov_performance_init(void) -{ - return cpufreq_register_governor(&cpufreq_gov_performance); -} - -static void __exit cpufreq_gov_performance_exit(void) -{ - cpufreq_unregister_governor(&cpufreq_gov_performance); -} - #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE struct cpufreq_governor *cpufreq_default_governor(void) { @@ -50,5 +40,5 @@ MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("CPUfreq policy governor 'performance'"); MODULE_LICENSE("GPL"); -core_initcall(cpufreq_gov_performance_init); -module_exit(cpufreq_gov_performance_exit); +cpufreq_governor_init(cpufreq_gov_performance); +cpufreq_governor_exit(cpufreq_gov_performance); diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index 1ae66019eb83..7749522355b5 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -23,16 +23,6 @@ static struct cpufreq_governor cpufreq_gov_powersave = { .owner = THIS_MODULE, }; -static int __init cpufreq_gov_powersave_init(void) -{ - return cpufreq_register_governor(&cpufreq_gov_powersave); -} - -static void __exit cpufreq_gov_powersave_exit(void) -{ - cpufreq_unregister_governor(&cpufreq_gov_powersave); -} - MODULE_AUTHOR("Dominik Brodowski "); MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'"); MODULE_LICENSE("GPL"); @@ -42,9 +32,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) { return &cpufreq_gov_powersave; } - -core_initcall(cpufreq_gov_powersave_init); -#else -module_init(cpufreq_gov_powersave_init); #endif -module_exit(cpufreq_gov_powersave_exit); + +cpufreq_governor_init(cpufreq_gov_powersave); +cpufreq_governor_exit(cpufreq_gov_powersave); diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index b43e7cd502c5..50a4d7846580 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -126,16 +126,6 @@ static struct cpufreq_governor cpufreq_gov_userspace = { .owner = THIS_MODULE, }; -static int __init cpufreq_gov_userspace_init(void) -{ - return cpufreq_register_governor(&cpufreq_gov_userspace); -} - -static void __exit cpufreq_gov_userspace_exit(void) -{ - cpufreq_unregister_governor(&cpufreq_gov_userspace); -} - MODULE_AUTHOR("Dominik Brodowski , " "Russell King "); MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'"); @@ -146,9 +136,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) { return &cpufreq_gov_userspace; } - -core_initcall(cpufreq_gov_userspace_init); -#else -module_init(cpufreq_gov_userspace_init); #endif -module_exit(cpufreq_gov_userspace_exit); + +cpufreq_governor_init(cpufreq_gov_userspace); +cpufreq_governor_exit(cpufreq_gov_userspace); -- cgit v1.2.3 From 8412b4563e5910485c7bcd4fdcd8bcc3e728284c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 29 Jun 2020 13:55:00 +0530 Subject: cpufreq: Specify default governor on command line Currently, the only way to specify the default CPUfreq governor is via Kconfig options, which suits users who can build the kernel themselves perfectly. However, for those who use a distro-like kernel (such as Android, with the Generic Kernel Image project), the only way to use a non-default governor is to boot to userspace, and to then switch using the sysfs interface. Being able to specify the default governor on the command line, like is the case for cpuidle, would allow those users to specify their governor of choice earlier on, and to simplify the userspace boot procedure slighlty. To support this use-case, add a kernel command line parameter allowing the default governor for CPUfreq to be specified, which takes precedence over the built-in default. This implementation has one notable limitation: the default governor must be registered before the driver. This is solved for builtin governors and drivers using appropriate *_initcall() functions. And in the modular case, this must be reflected as a constraint on the module loading order. Signed-off-by: Quentin Perret [ Viresh: Converted 'default_governor' to a string and parsing it only at initcall level, and several updates to cpufreq_init_policy(). ] Signed-off-by: Viresh Kumar [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e9e8200a0211..ad94b1d47ddb 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -50,6 +50,8 @@ static LIST_HEAD(cpufreq_governor_list); #define for_each_governor(__governor) \ list_for_each_entry(__governor, &cpufreq_governor_list, governor_list) +static char default_governor[CPUFREQ_NAME_LEN]; + /** * The "cpufreq driver" - the arch- or hardware-dependent low * level driver of CPUFreq support, and its spinlock. This lock @@ -1061,7 +1063,6 @@ __weak struct cpufreq_governor *cpufreq_default_governor(void) static int cpufreq_init_policy(struct cpufreq_policy *policy) { - struct cpufreq_governor *def_gov = cpufreq_default_governor(); struct cpufreq_governor *gov = NULL; unsigned int pol = CPUFREQ_POLICY_UNKNOWN; int ret; @@ -1071,21 +1072,27 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) gov = get_governor(policy->last_governor); if (gov) { pr_debug("Restoring governor %s for cpu %d\n", - policy->governor->name, policy->cpu); - } else if (def_gov) { - gov = def_gov; - __module_get(gov->owner); + gov->name, policy->cpu); } else { - return -ENODATA; + gov = get_governor(default_governor); + } + + if (!gov) { + gov = cpufreq_default_governor(); + if (!gov) + return -ENODATA; + __module_get(gov->owner); } + } else { + /* Use the default policy if there is no last_policy. */ if (policy->last_policy) { pol = policy->last_policy; - } else if (def_gov) { - pol = cpufreq_parse_policy(def_gov->name); + } else { + pol = cpufreq_parse_policy(default_governor); /* - * In case the default governor is neiter "performance" + * In case the default governor is neither "performance" * nor "powersave", fall back to the initial policy * value set by the driver. */ @@ -2795,13 +2802,19 @@ EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); static int __init cpufreq_core_init(void) { + struct cpufreq_governor *gov = cpufreq_default_governor(); + if (cpufreq_disabled()) return -ENODEV; cpufreq_global_kobject = kobject_create_and_add("cpufreq", &cpu_subsys.dev_root->kobj); BUG_ON(!cpufreq_global_kobject); + if (!strlen(default_governor)) + strncpy(default_governor, gov->name, CPUFREQ_NAME_LEN); + return 0; } module_param(off, int, 0444); +module_param_string(default_governor, default_governor, CPUFREQ_NAME_LEN, 0444); core_initcall(cpufreq_core_init); -- cgit v1.2.3 From 3a7e4fbbfd1a266ceea33e9b48f77f233ac994ac Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 29 Jun 2020 15:34:40 +0530 Subject: cpufreq: Remove the weakly defined cpufreq_default_governor() The default cpufreq governor is chosen with the help of a "choice" option in the Kconfig which will always end up selecting one of the governors and so the weakly defined definition of cpufreq_default_governor() will never get called. Moreover, this makes us skip the checking of the return value of that routine as it will always be non NULL. If the Kconfig option changes in future, then we will start getting a link error instead (and it won't go unnoticed as in the case of the weak definition). Suggested-by: Quentin Perret Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ad94b1d47ddb..036f4cc42ede 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1056,11 +1056,6 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return 0; } -__weak struct cpufreq_governor *cpufreq_default_governor(void) -{ - return NULL; -} - static int cpufreq_init_policy(struct cpufreq_policy *policy) { struct cpufreq_governor *gov = NULL; @@ -1079,8 +1074,6 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) if (!gov) { gov = cpufreq_default_governor(); - if (!gov) - return -ENODATA; __module_get(gov->owner); } -- cgit v1.2.3 From 8d87ae48ced2dffd5e7247d19eb4c88be6f1c6f1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 Jul 2020 16:32:13 -0700 Subject: PM: domains: Fix up terminology with parent/child The genpd infrastructure uses the terms master/slave, but such uses have no external exposures (not even in Documentation/driver-api/pm/*) and are not mandated by nor associated with any external specifications. Change the language used through-out to parent/child. There was one possible exception in the debugfs node "pm_genpd/pm_genpd_summary" but its path has no hits outside of the kernel itself when performing a code search[1], and it seems even this single usage has been non-functional since it was introduced due to a typo in the Python ("apend" instead of correct "append"). Fix the typo while we're at it. Link: https://codesearch.debian.net/ # [1] Signed-off-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Reviewed-by: Kieran Bingham Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 194 +++++++++++++++++------------------ drivers/base/power/domain_governor.c | 12 +-- 2 files changed, 103 insertions(+), 103 deletions(-) (limited to 'drivers') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 0a01df608849..2cb5e04cf86c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -263,18 +263,18 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, /* * Traverse all sub-domains within the domain. This can be * done without any additional locking as the link->performance_state - * field is protected by the master genpd->lock, which is already taken. + * field is protected by the parent genpd->lock, which is already taken. * * Also note that link->performance_state (subdomain's performance state - * requirement to master domain) is different from - * link->slave->performance_state (current performance state requirement + * requirement to parent domain) is different from + * link->child->performance_state (current performance state requirement * of the devices/sub-domains of the subdomain) and so can have a * different value. * * Note that we also take vote from powered-off sub-domains into account * as the same is done for devices right now. */ - list_for_each_entry(link, &genpd->master_links, master_node) { + list_for_each_entry(link, &genpd->parent_links, parent_node) { if (link->performance_state > state) state = link->performance_state; } @@ -285,40 +285,40 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, static int _genpd_set_performance_state(struct generic_pm_domain *genpd, unsigned int state, int depth) { - struct generic_pm_domain *master; + struct generic_pm_domain *parent; struct gpd_link *link; - int master_state, ret; + int parent_state, ret; if (state == genpd->performance_state) return 0; - /* Propagate to masters of genpd */ - list_for_each_entry(link, &genpd->slave_links, slave_node) { - master = link->master; + /* Propagate to parents of genpd */ + list_for_each_entry(link, &genpd->child_links, child_node) { + parent = link->parent; - if (!master->set_performance_state) + if (!parent->set_performance_state) continue; - /* Find master's performance state */ + /* Find parent's performance state */ ret = dev_pm_opp_xlate_performance_state(genpd->opp_table, - master->opp_table, + parent->opp_table, state); if (unlikely(ret < 0)) goto err; - master_state = ret; + parent_state = ret; - genpd_lock_nested(master, depth + 1); + genpd_lock_nested(parent, depth + 1); link->prev_performance_state = link->performance_state; - link->performance_state = master_state; - master_state = _genpd_reeval_performance_state(master, - master_state); - ret = _genpd_set_performance_state(master, master_state, depth + 1); + link->performance_state = parent_state; + parent_state = _genpd_reeval_performance_state(parent, + parent_state); + ret = _genpd_set_performance_state(parent, parent_state, depth + 1); if (ret) link->performance_state = link->prev_performance_state; - genpd_unlock(master); + genpd_unlock(parent); if (ret) goto err; @@ -333,26 +333,26 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, err: /* Encountered an error, lets rollback */ - list_for_each_entry_continue_reverse(link, &genpd->slave_links, - slave_node) { - master = link->master; + list_for_each_entry_continue_reverse(link, &genpd->child_links, + child_node) { + parent = link->parent; - if (!master->set_performance_state) + if (!parent->set_performance_state) continue; - genpd_lock_nested(master, depth + 1); + genpd_lock_nested(parent, depth + 1); - master_state = link->prev_performance_state; - link->performance_state = master_state; + parent_state = link->prev_performance_state; + link->performance_state = parent_state; - master_state = _genpd_reeval_performance_state(master, - master_state); - if (_genpd_set_performance_state(master, master_state, depth + 1)) { + parent_state = _genpd_reeval_performance_state(parent, + parent_state); + if (_genpd_set_performance_state(parent, parent_state, depth + 1)) { pr_err("%s: Failed to roll back to %d performance state\n", - master->name, master_state); + parent->name, parent_state); } - genpd_unlock(master); + genpd_unlock(parent); } return ret; @@ -552,7 +552,7 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, /* * If sd_count > 0 at this point, one of the subdomains hasn't - * managed to call genpd_power_on() for the master yet after + * managed to call genpd_power_on() for the parent yet after * incrementing it. In that case genpd_power_on() will wait * for us to drop the lock, so we can call .power_off() and let * the genpd_power_on() restore power for us (this shouldn't @@ -566,22 +566,22 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, genpd->status = GPD_STATE_POWER_OFF; genpd_update_accounting(genpd); - list_for_each_entry(link, &genpd->slave_links, slave_node) { - genpd_sd_counter_dec(link->master); - genpd_lock_nested(link->master, depth + 1); - genpd_power_off(link->master, false, depth + 1); - genpd_unlock(link->master); + list_for_each_entry(link, &genpd->child_links, child_node) { + genpd_sd_counter_dec(link->parent); + genpd_lock_nested(link->parent, depth + 1); + genpd_power_off(link->parent, false, depth + 1); + genpd_unlock(link->parent); } return 0; } /** - * genpd_power_on - Restore power to a given PM domain and its masters. + * genpd_power_on - Restore power to a given PM domain and its parents. * @genpd: PM domain to power up. * @depth: nesting count for lockdep. * - * Restore power to @genpd and all of its masters so that it is possible to + * Restore power to @genpd and all of its parents so that it is possible to * resume a device belonging to it. */ static int genpd_power_on(struct generic_pm_domain *genpd, unsigned int depth) @@ -594,20 +594,20 @@ static int genpd_power_on(struct generic_pm_domain *genpd, unsigned int depth) /* * The list is guaranteed not to change while the loop below is being - * executed, unless one of the masters' .power_on() callbacks fiddles + * executed, unless one of the parents' .power_on() callbacks fiddles * with it. */ - list_for_each_entry(link, &genpd->slave_links, slave_node) { - struct generic_pm_domain *master = link->master; + list_for_each_entry(link, &genpd->child_links, child_node) { + struct generic_pm_domain *parent = link->parent; - genpd_sd_counter_inc(master); + genpd_sd_counter_inc(parent); - genpd_lock_nested(master, depth + 1); - ret = genpd_power_on(master, depth + 1); - genpd_unlock(master); + genpd_lock_nested(parent, depth + 1); + ret = genpd_power_on(parent, depth + 1); + genpd_unlock(parent); if (ret) { - genpd_sd_counter_dec(master); + genpd_sd_counter_dec(parent); goto err; } } @@ -623,12 +623,12 @@ static int genpd_power_on(struct generic_pm_domain *genpd, unsigned int depth) err: list_for_each_entry_continue_reverse(link, - &genpd->slave_links, - slave_node) { - genpd_sd_counter_dec(link->master); - genpd_lock_nested(link->master, depth + 1); - genpd_power_off(link->master, false, depth + 1); - genpd_unlock(link->master); + &genpd->child_links, + child_node) { + genpd_sd_counter_dec(link->parent); + genpd_lock_nested(link->parent, depth + 1); + genpd_power_off(link->parent, false, depth + 1); + genpd_unlock(link->parent); } return ret; @@ -932,13 +932,13 @@ late_initcall(genpd_power_off_unused); #ifdef CONFIG_PM_SLEEP /** - * genpd_sync_power_off - Synchronously power off a PM domain and its masters. + * genpd_sync_power_off - Synchronously power off a PM domain and its parents. * @genpd: PM domain to power off, if possible. * @use_lock: use the lock. * @depth: nesting count for lockdep. * * Check if the given PM domain can be powered off (during system suspend or - * hibernation) and do that if so. Also, in that case propagate to its masters. + * hibernation) and do that if so. Also, in that case propagate to its parents. * * This function is only called in "noirq" and "syscore" stages of system power * transitions. The "noirq" callbacks may be executed asynchronously, thus in @@ -963,21 +963,21 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, genpd->status = GPD_STATE_POWER_OFF; - list_for_each_entry(link, &genpd->slave_links, slave_node) { - genpd_sd_counter_dec(link->master); + list_for_each_entry(link, &genpd->child_links, child_node) { + genpd_sd_counter_dec(link->parent); if (use_lock) - genpd_lock_nested(link->master, depth + 1); + genpd_lock_nested(link->parent, depth + 1); - genpd_sync_power_off(link->master, use_lock, depth + 1); + genpd_sync_power_off(link->parent, use_lock, depth + 1); if (use_lock) - genpd_unlock(link->master); + genpd_unlock(link->parent); } } /** - * genpd_sync_power_on - Synchronously power on a PM domain and its masters. + * genpd_sync_power_on - Synchronously power on a PM domain and its parents. * @genpd: PM domain to power on. * @use_lock: use the lock. * @depth: nesting count for lockdep. @@ -994,16 +994,16 @@ static void genpd_sync_power_on(struct generic_pm_domain *genpd, bool use_lock, if (genpd_status_on(genpd)) return; - list_for_each_entry(link, &genpd->slave_links, slave_node) { - genpd_sd_counter_inc(link->master); + list_for_each_entry(link, &genpd->child_links, child_node) { + genpd_sd_counter_inc(link->parent); if (use_lock) - genpd_lock_nested(link->master, depth + 1); + genpd_lock_nested(link->parent, depth + 1); - genpd_sync_power_on(link->master, use_lock, depth + 1); + genpd_sync_power_on(link->parent, use_lock, depth + 1); if (use_lock) - genpd_unlock(link->master); + genpd_unlock(link->parent); } _genpd_power_on(genpd, false); @@ -1443,12 +1443,12 @@ static void genpd_update_cpumask(struct generic_pm_domain *genpd, if (!genpd_is_cpu_domain(genpd)) return; - list_for_each_entry(link, &genpd->slave_links, slave_node) { - struct generic_pm_domain *master = link->master; + list_for_each_entry(link, &genpd->child_links, child_node) { + struct generic_pm_domain *parent = link->parent; - genpd_lock_nested(master, depth + 1); - genpd_update_cpumask(master, cpu, set, depth + 1); - genpd_unlock(master); + genpd_lock_nested(parent, depth + 1); + genpd_update_cpumask(parent, cpu, set, depth + 1); + genpd_unlock(parent); } if (set) @@ -1636,17 +1636,17 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd, goto out; } - list_for_each_entry(itr, &genpd->master_links, master_node) { - if (itr->slave == subdomain && itr->master == genpd) { + list_for_each_entry(itr, &genpd->parent_links, parent_node) { + if (itr->child == subdomain && itr->parent == genpd) { ret = -EINVAL; goto out; } } - link->master = genpd; - list_add_tail(&link->master_node, &genpd->master_links); - link->slave = subdomain; - list_add_tail(&link->slave_node, &subdomain->slave_links); + link->parent = genpd; + list_add_tail(&link->parent_node, &genpd->parent_links); + link->child = subdomain; + list_add_tail(&link->child_node, &subdomain->child_links); if (genpd_status_on(subdomain)) genpd_sd_counter_inc(genpd); @@ -1660,7 +1660,7 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd, /** * pm_genpd_add_subdomain - Add a subdomain to an I/O PM domain. - * @genpd: Master PM domain to add the subdomain to. + * @genpd: Leader PM domain to add the subdomain to. * @subdomain: Subdomain to be added. */ int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, @@ -1678,7 +1678,7 @@ EXPORT_SYMBOL_GPL(pm_genpd_add_subdomain); /** * pm_genpd_remove_subdomain - Remove a subdomain from an I/O PM domain. - * @genpd: Master PM domain to remove the subdomain from. + * @genpd: Leader PM domain to remove the subdomain from. * @subdomain: Subdomain to be removed. */ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, @@ -1693,19 +1693,19 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, genpd_lock(subdomain); genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING); - if (!list_empty(&subdomain->master_links) || subdomain->device_count) { + if (!list_empty(&subdomain->parent_links) || subdomain->device_count) { pr_warn("%s: unable to remove subdomain %s\n", genpd->name, subdomain->name); ret = -EBUSY; goto out; } - list_for_each_entry_safe(link, l, &genpd->master_links, master_node) { - if (link->slave != subdomain) + list_for_each_entry_safe(link, l, &genpd->parent_links, parent_node) { + if (link->child != subdomain) continue; - list_del(&link->master_node); - list_del(&link->slave_node); + list_del(&link->parent_node); + list_del(&link->child_node); kfree(link); if (genpd_status_on(subdomain)) genpd_sd_counter_dec(genpd); @@ -1770,8 +1770,8 @@ int pm_genpd_init(struct generic_pm_domain *genpd, if (IS_ERR_OR_NULL(genpd)) return -EINVAL; - INIT_LIST_HEAD(&genpd->master_links); - INIT_LIST_HEAD(&genpd->slave_links); + INIT_LIST_HEAD(&genpd->parent_links); + INIT_LIST_HEAD(&genpd->child_links); INIT_LIST_HEAD(&genpd->dev_list); genpd_lock_init(genpd); genpd->gov = gov; @@ -1848,15 +1848,15 @@ static int genpd_remove(struct generic_pm_domain *genpd) return -EBUSY; } - if (!list_empty(&genpd->master_links) || genpd->device_count) { + if (!list_empty(&genpd->parent_links) || genpd->device_count) { genpd_unlock(genpd); pr_err("%s: unable to remove %s\n", __func__, genpd->name); return -EBUSY; } - list_for_each_entry_safe(link, l, &genpd->slave_links, slave_node) { - list_del(&link->master_node); - list_del(&link->slave_node); + list_for_each_entry_safe(link, l, &genpd->child_links, child_node) { + list_del(&link->parent_node); + list_del(&link->child_node); kfree(link); } @@ -2827,12 +2827,12 @@ static int genpd_summary_one(struct seq_file *s, /* * Modifications on the list require holding locks on both - * master and slave, so we are safe. + * parent and child, so we are safe. * Also genpd->name is immutable. */ - list_for_each_entry(link, &genpd->master_links, master_node) { - seq_printf(s, "%s", link->slave->name); - if (!list_is_last(&link->master_node, &genpd->master_links)) + list_for_each_entry(link, &genpd->parent_links, parent_node) { + seq_printf(s, "%s", link->child->name); + if (!list_is_last(&link->parent_node, &genpd->parent_links)) seq_puts(s, ", "); } @@ -2860,7 +2860,7 @@ static int summary_show(struct seq_file *s, void *data) struct generic_pm_domain *genpd; int ret = 0; - seq_puts(s, "domain status slaves\n"); + seq_puts(s, "domain status children\n"); seq_puts(s, " /device runtime status\n"); seq_puts(s, "-----------------------------------------------------------