diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 17:51:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-25 17:51:55 -0700 |
commit | 0db9723cacf4d62bc3685fb15179b39ee4e17679 (patch) | |
tree | 7de16280234a3d98d8f7dd95e623ec381fd5af36 /drivers | |
parent | 4570a37169d4b44d316f40b2ccc681dc93fedc7b (diff) | |
parent | 111b23cf895b5cbcdc1b2c6580be1bb78a577d05 (diff) |
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux
Pull thermal management updates from Zhang Rui:
"Specifics:
- enhance Thermal Framework with several new capabilities:
* use power estimates
* compute weights with relative integers instead of percentages
* allow governors to have private data in thermal zones
* export thermal zone parameters through sysfs
Thanks to the ARM thermal team (Javi, Punit, KP).
- introduce a new thermal governor: power allocator. First in kernel
closed loop PI(D) controller for thermal control. Thanks to ARM
thermal team.
- enhance OF thermal to allow thermal zones to have sustainable power
HW specification. Thanks to Punit.
- introduce thermal driver for Intel Quark SoC x1000platform. Thanks
to Ong, Boon Leong.
- introduce QPNP PMIC temperature alarm driver. Thanks to Ivan T. I.
- introduce thermal driver for Hisilicon hi6220. Thanks to
kongxinwei.
- enhance Exynos thermal driver to handle Exynos5433 TMU. Thanks to
Chanwoo C.
- TI thermal driver now has a better implementation for EOCZ bit.
From Pavel M.
- add id for Skylake processors in int340x processor thermal driver.
- a couple of small fixes and cleanups."
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux: (36 commits)
thermal: hisilicon: add new hisilicon thermal sensor driver
dt-bindings: Document the hi6220 thermal sensor bindings
thermal: of-thermal: add support for reading coefficients property
thermal: support slope and offset coefficients
thermal: power_allocator: round the division when divvying up power
thermal: exynos: Add the support for Exynos5433 TMU
thermal: cpu_cooling: Fix power calculation when CPUs are offline
thermal: cpu_cooling: Remove cpu_dev update on policy CPU update
thermal: export thermal_zone_parameters to sysfs
thermal: cpu_cooling: Check memory allocation of power_table
ti-soc-thermal: request temperature periodically if hw can't do that itself
ti-soc-thermal: implement eocz bit to make driver useful on omap3
cleanup ti-soc-thermal
thermal: remove stale THERMAL_POWER_ACTOR select
thermal: Default OF created trip points to writable
thermal: core: Add Kconfig option to enable writable trips
thermal: x86_pkg_temp: drop const for thermal_zone_parameters
of: thermal: Introduce sustainable power for a thermal zone
thermal: add trace events to the power allocator governor
thermal: introduce the Power Allocator governor
...
Diffstat (limited to 'drivers')
25 files changed, 3620 insertions, 533 deletions
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index d24fa1964eb8..6d4e44ea74ac 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -800,7 +800,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, result = thermal_zone_bind_cooling_device (thermal, trip, cdev, - THERMAL_NO_LIMIT, THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); else result = thermal_zone_unbind_cooling_device @@ -824,7 +825,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, if (bind) result = thermal_zone_bind_cooling_device (thermal, trip, cdev, - THERMAL_NO_LIMIT, THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); else result = thermal_zone_unbind_cooling_device (thermal, trip, cdev); @@ -841,7 +843,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, result = thermal_zone_bind_cooling_device (thermal, THERMAL_TRIPS_NONE, cdev, THERMAL_NO_LIMIT, - THERMAL_NO_LIMIT); + THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT); else result = thermal_zone_unbind_cooling_device (thermal, THERMAL_TRIPS_NONE, diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c index 594c918b553d..1ef02daddb60 100644 --- a/drivers/platform/x86/acerhdf.c +++ b/drivers/platform/x86/acerhdf.c @@ -372,7 +372,8 @@ static int acerhdf_bind(struct thermal_zone_device *thermal, return 0; if (thermal_zone_bind_cooling_device(thermal, 0, cdev, - THERMAL_NO_LIMIT, THERMAL_NO_LIMIT)) { + THERMAL_NO_LIMIT, THERMAL_NO_LIMIT, + THERMAL_WEIGHT_DEFAULT)) { pr_err("error binding cooling dev\n"); return -EINVAL; } diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index af40db0df58e..118938ee8552 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -42,6 +42,17 @@ config THERMAL_OF Say 'Y' here if you need to build thermal infrastructure based on device tree. +config THERMAL_WRITABLE_TRIPS + bool "Enable writable trip points" + help + This option allows the system integrator to choose whether + trip temperatures can be changed from userspace. The + writable trips need to be specified when setting up the + thermal zone but the choice here takes precedence. + + Say 'Y' here if you would like to allow userspace tools to + change trip temperatures. + choice prompt "Default Thermal governor" default THERMAL_DEFAULT_GOV_STEP_WISE @@ -71,6 +82,14 @@ config THERMAL_DEFAULT_GOV_USER_SPACE Select this if you want to let the user space manage the platform thermals. +config THERMAL_DEFAULT_GOV_POWER_ALLOCATOR + bool "power_allocator" + select THERMAL_GOV_POWER_ALLOCATOR + help + Select this if you want to control temperature based on + system and device power allocation. This governor can only + operate on cooling devices that implement the power API. + endchoice config THERMAL_GOV_FAIR_SHARE @@ -99,6 +118,12 @@ config THERMAL_GOV_USER_SPACE help Enable this to let the user space manage the platform thermals. +config THERMAL_GOV_POWER_ALLOCATOR + bool "Power allocator thermal governor" + help + Enable this to manage platform thermals by dynamically + allocating and limiting power to devices. + config CPU_THERMAL bool "generic cpu cooling support" depends on CPU_FREQ @@ -136,6 +161,14 @@ config THERMAL_EMULATION because userland can easily disable the thermal policy by simply flooding this sysfs node with low temperature values. +config HISI_THERMAL + tristate "Hisilicon thermal driver" + depends on ARCH_HISI && CPU_THERMAL && OF + help + Enable this to plug hisilicon's thermal sensor driver into the Linux + thermal framework. cpufreq is used as the cooling device to throttle + CPUs when the passive trip is crossed. + config IMX_THERMAL tristate "Temperature sensor driver for Freescale i.MX SoCs" depends on CPU_THERMAL @@ -249,9 +282,20 @@ config X86_PKG_TEMP_THERMAL two trip points which can be set by user to get notifications via thermal notification methods. +config INTEL_SOC_DTS_IOSF_CORE + tristate + depends on X86 + select IOSF_MBI + help + This is becoming a common feature for Intel SoCs to expose the additional + digital temperature sensors (DTSs) using side band interface (IOSF). This + implements the common set of helper functions to register, get temperature + and get/set thresholds on DTSs. + config INTEL_SOC_DTS_THERMAL tristate "Intel SoCs DTS thermal driver" - depends on X86 && IOSF_MBI + depends on X86 + select INTEL_SOC_DTS_IOSF_CORE help Enable this to register Intel SoCs (e.g. Bay Trail) platform digital temperature sensor (DTS). These SoCs have two additional DTSs in @@ -261,12 +305,23 @@ config INTEL_SOC_DTS_THERMAL notification methods.The other trip is a critical trip point, which was set by the driver based on the TJ MAX temperature. +config INTEL_QUARK_DTS_THERMAL + tristate "Intel Quark DTS thermal driver" + depends on X86_INTEL_QUARK + help + Enable this to register Intel Quark SoC (e.g. X1000) platform digital + temperature sensor (DTS). For X1000 SoC, it has one on-die DTS. + The DTS will be registered as a thermal zone. There are two trip points: + hot & critical. The critical trip point default value is set by + underlying BIOS/Firmware. + config INT340X_THERMAL tristate "ACPI INT340X thermal drivers" depends on X86 && ACPI select THERMAL_GOV_USER_SPACE select ACPI_THERMAL_REL select ACPI_FAN + select INTEL_SOC_DTS_IOSF_CORE help Newer laptops and tablets that use ACPI may have thermal sensors and other devices with thermal control capabilities outside the core @@ -299,4 +354,15 @@ depends on ARCH_STI && OF source "drivers/thermal/st/Kconfig" endmenu +config QCOM_SPMI_TEMP_ALARM + tristate "Qualcomm SPMI PMIC Temperature Alarm" + depends on OF && SPMI && IIO + select REGMAP_SPMI + help + This enables a thermal sysfs driver for Qualcomm plug-and-play (QPNP) + PMIC devices. It shows up in sysfs as a thermal sensor with multiple + trip points. The temperature reported by the thermal sensor reflects the + real time die temperature if an ADC is present or an estimate of the + temperature based upon the over temperature stage value. + endif diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index fa0dc486790f..535dfee1496f 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -14,6 +14,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE) += fair_share.o thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG) += gov_bang_bang.o thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE) += step_wise.o thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE) += user_space.o +thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR) += power_allocator.o # cpufreq cooling thermal_sys-$(CONFIG_CPU_THERMAL) += cpu_cooling.o @@ -22,6 +23,7 @@ thermal_sys-$(CONFIG_CPU_THERMAL) += cpu_cooling.o thermal_sys-$(CONFIG_CLOCK_THERMAL) += clock_cooling.o # platform thermal drivers +obj-$(CONFIG_QCOM_SPMI_TEMP_ALARM) += qcom-spmi-temp-alarm.o obj-$(CONFIG_SPEAR_THERMAL) += spear_thermal.o obj-$(CONFIG_ROCKCHIP_THERMAL) += rockchip_thermal.o obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o @@ -34,8 +36,11 @@ obj-$(CONFIG_IMX_THERMAL) += imx_thermal.o obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o obj-$(CONFIG_X86_PKG_TEMP_THERMAL) += x86_pkg_temp_thermal.o +obj-$(CONFIG_INTEL_SOC_DTS_IOSF_CORE) += intel_soc_dts_iosf.o obj-$(CONFIG_INTEL_SOC_DTS_THERMAL) += intel_soc_dts_thermal.o +obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL) += intel_quark_dts_thermal.o obj-$(CONFIG_TI_SOC_THERMAL) += ti-soc-thermal/ obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_ST_THERMAL) += st/ obj-$(CONFIG_TEGRA_SOCTHERM) += tegra_soctherm.o +obj-$(CONFIG_HISI_THERMAL) += hisi_thermal.o diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index f65f0d109fc8..6509c61b9648 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -26,10 +26,13 @@ #include <linux/thermal.h> #include <linux/cpufreq.h> #include <linux/err.h> +#include <linux/pm_opp.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/cpu_cooling.h> +#include <trace/events/thermal.h> + /* * Cooling state <-> CPUFreq frequency * @@ -45,6 +48,19 @@ */ /** + * struct power_table - frequency to power conversion + * @frequency: frequency in KHz + * @power: power in mW + * + * This structure is built when the cooling device registers and helps + * in translating frequency to power and viceversa. + */ +struct power_table { + u32 frequency; + u32 power; +}; + +/** * struct cpufreq_cooling_device - data for cooling device with cpufreq * @id: unique integer value corresponding to each cpufreq_cooling_device * registered. @@ -58,6 +74,15 @@ * cpufreq frequencies. * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device. * @node: list_head to link all cpufreq_cooling_device together. + * @last_load: load measured by the latest call to cpufreq_get_actual_power() + * @time_in_idle: previous reading of the absolute time that this cpu was idle + * @time_in_idle_timestamp: wall time of the last invocation of + * get_cpu_idle_time_us() + * @dyn_power_table: array of struct power_table for frequency to power + * conversion, sorted in ascending order. + * @dyn_power_table_entries: number of entries in the @dyn_power_table array + * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered + * @plat_get_static_power: callback to calculate the static power * * This structure is required for keeping information of each registered * cpufreq_cooling_device. @@ -71,6 +96,13 @@ struct cpufreq_cooling_device { unsigned int *freq_table; /* In descending order */ struct cpumask allowed_cpus; struct list_head node; + u32 last_load; + u64 *time_in_idle; + u64 *time_in_idle_timestamp; + struct power_table *dyn_power_table; + int dyn_power_table_entries; + struct device *cpu_dev; + get_static_t plat_get_static_power; }; static DEFINE_IDR(cpufreq_idr); static DEFINE_MUTEX(cooling_cpufreq_lock); @@ -186,23 +218,237 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, unsigned long max_freq = 0; struct cpufreq_cooling_device *cpufreq_dev; - if (event != CPUFREQ_ADJUST) - return 0; + switch (event) { - mutex_lock(&cooling_cpufreq_lock); - list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { - if (!cpumask_test_cpu(policy->cpu, - &cpufreq_dev->allowed_cpus)) + case CPUFREQ_ADJUST: + mutex_lock(&cooling_cpufreq_lock); + list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) { + if (!cpumask_test_cpu(policy->cpu, + &cpufreq_dev->allowed_cpus)) + continue; + + max_freq = cpufreq_dev->cpufreq_val; + + if (policy->max != max_freq) + cpufreq_verify_within_limits(policy, 0, + max_freq); + } + mutex_unlock(&cooling_cpufreq_lock); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +/** + * build_dyn_power_table() - create a dynamic power to frequency table + * @cpufreq_device: the cpufreq cooling device in which to store the table + * @capacitance: dynamic power coefficient for these cpus + * + * Build a dynamic power to frequency table for this cpu and store it + * in @cpufreq_device. This table will be used in cpu_power_to_freq() and + * cpu_freq_to_power() to convert between power and frequency + * efficiently. Power is stored in mW, frequency in KHz. The + * resulting table is in ascending order. + * + * Return: 0 on success, -E* on error. + */ +static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device, + u32 capacitance) +{ + struct power_table *power_table; + struct dev_pm_opp *opp; + struct device *dev = NULL; + int num_opps = 0, cpu, i, ret = 0; + unsigned long freq; + + rcu_read_lock(); + + for_each_cpu(cpu, &cpufreq_device->allowed_cpus) { + dev = get_cpu_device(cpu); + if (!dev) { + dev_warn(&cpufreq_device->cool_dev->device, + "No cpu device for cpu %d\n", cpu); continue; + } + + num_opps = dev_pm_opp_get_opp_count(dev); + if (num_opps > 0) { + break; + } else if (num_opps < 0) { + ret = num_opps; + goto unlock; + } + } - max_freq = cpufreq_dev->cpufreq_val; + if (num_opps == 0) { + ret = -EINVAL; + goto unlock; + } - if (policy->max != max_freq) - cpufreq_verify_within_limits(policy, 0, max_freq); + power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL); + if (!power_table) { + ret = -ENOMEM; + goto unlock; } - mutex_unlock(&cooling_cpufreq_lock); - return 0; + for (freq = 0, i = 0; + opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp); + freq++, i++) { + u32 freq_mhz, voltage_mv; + u64 power; + + freq_mhz = freq / 1000000; + voltage_mv = dev_pm_opp_get_voltage(opp) / 1000; + + /* + * Do the multiplication with MHz and millivolt so as + * to not overflow. + */ + power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv; + do_div(power, 1000000000); + + /* frequency is stored in power_table in KHz */ + power_table[i].frequency = freq / 1000; + + /* power is stored in mW */ + power_table[i].power = power; + } + + if (i == 0) { + ret = PTR_ERR(opp); + goto unlock; + } + + cpufreq_device->cpu_dev = dev; + cpufreq_device->dyn_power_table = power_table; + cpufreq_device->dyn_power_table_entries = i; + +unlock: + rcu_read_unlock(); + return ret; +} + +static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device, + u32 freq) +{ + int i; + struct power_table *pt = cpufreq_device->dyn_power_table; + + for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++) + if (freq < pt[i].frequency) + break; + + return pt[i - 1].power; +} + +static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device, + u32 power) +{ + int i; + struct power_table *pt = cpufreq_device->dyn_power_table; + + for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++) + if (power < pt[i].power) + break; + + return pt[i - 1].frequency; +} + +/** + * get_load() - get load for a cpu since last updated + * @cpufreq_device: &struct cpufreq_cooling_device for this cpu + * @cpu: cpu number + * + * Return: The average load of cpu @cpu in percentage since this + * function was last called. + */ +static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu) +{ + u32 load; + u64 now, now_idle, delta_time, delta_idle; + + now_idle = get_cpu_idle_time(cpu, &now, 0); + delta_idle = now_idle - cpufreq_device->time_in_idle[cpu]; + delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu]; + + if (delta_time <= delta_idle) + load = 0; + else + load = div64_u64(100 * (delta_time - delta_idle), delta_time); + + cpufreq_device->time_in_idle[cpu] = now_idle; + cpufreq_device->time_in_idle_timestamp[cpu] = now; + + return load; +} + +/** + * get_static_power() - calculate the static power consumed by the cpus + * @cpufreq_device: struct &cpufreq_cooling_device for this cpu cdev + * @tz: thermal zone device in which we're operating + * @freq: frequency in KHz + * @power: pointer in which to store the calculated static power + * + * Calculate the static power consumed by the cpus described by + * @cpu_actor running at frequency @freq. This function relies on a + * platform specific function that should have been provided when the + * actor was registered. If it wasn't, the static power is assumed to + * be negligible. The calculated static power is stored in @power. + * + * Return: 0 on success, -E* on failure. + */ +static int get_static_power(struct cpufreq_cooling_device *cpufreq_device, + struct thermal_zone_device *tz, unsigned long freq, + u32 *power) +{ + struct dev_pm_opp *opp; + unsigned long voltage; + struct cpumask *cpumask = &cpufreq_device->allowed_cpus; + unsigned long freq_hz = freq * 1000; + + if (!cpufreq_device->plat_get_static_power || + !cpufreq_device->cpu_dev) { + *power = 0; + return 0; + } + + rcu_read_lock(); + + opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz, + true); + voltage = dev_pm_opp_get_voltage(opp); + + rcu_read_unlock(); + + if (voltage == 0) { + dev_warn_ratelimited(cpufreq_device->cpu_dev, + "Failed to get voltage for frequency %lu: %ld\n", + freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0); + return -EINVAL; + } + + return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay, + voltage, power); +} + +/** + * get_dynamic_power() - calculate the dynamic power + * @cpufreq_device: &cpufreq_cooling_device for this cdev + * @freq: current frequency + * + * Return: the dynamic power consumed by the cpus described by + * @cpufreq_device. + */ +static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device, + unsigned long freq) +{ + u32 raw_cpu_power; + + raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq); + return (raw_cpu_power * cpufreq_device->last_load) / 100; } /* cpufreq cooling device callback functions are defined below */ @@ -280,8 +526,205 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, return 0; } +/** + * cpufreq_get_requested_power() - get the current power + * @cdev: &thermal_cooling_device pointer + * @tz: a valid thermal zone device pointer + * @power: pointer in which to store the resulting power + * + * Calculate the current power consumption of the cpus in milliwatts + * and store it in @power. This function should actually calculate + * the requested power, but it's hard to get the frequency that + * cpufreq would have assigned if there were no thermal limits. + * Instead, we calculate the current power on the assumption that the + * immediate future will look like the immediate past. + * + * We use the current frequency and the average load since this + * function was last called. In reality, there could have been + * multiple opps since this function was last called and that affects + * the load calculation. While it's not perfectly accurate, this + * simplification is good enough and works. REVISIT this, as more + * complex code may be needed if experiments show that it's not + * accurate enough. + * + * Return: 0 on success, -E* if getting the static power failed. + */ +static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, + u32 *power) +{ + unsigned long freq; + int i = 0, cpu, ret; + u32 static_power, dynamic_power, total_load = 0; + struct cpufreq_cooling_device *cpufreq_device = cdev->devdata; + u32 *load_cpu = NULL; + + cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask); + + /* + * All the CPUs are offline, thus the requested power by + * the cdev is 0 + */ + if (cpu >= nr_cpu_ids) { + *power = 0; + return 0; + } + + freq = cpufreq_quick_get(cpu); + + if (trace_thermal_power_cpu_get_power_enabled()) { + u32 ncpus = cpumask_weight(&cpufreq_device->allowed_cpus); + + load_cpu = devm_kcalloc(&cdev->device, ncpus, sizeof(*load_cpu), + GFP_KERNEL); + } + + for_each_cpu(cpu, &cpufreq_device->allowed_cpus) { + u32 load; + + if (cpu_online(cpu)) + load = get_load(cpufreq_device, cpu); + else + load = 0; + + total_load += load; + if (trace_thermal_power_cpu_limit_enabled() && load_cpu) + load_cpu[i] = load; + + i++; + } + + cpufreq_device->last_load = total_load; + + dynamic_power = get_dynamic_power(cpufreq_device, freq); + ret = get_static_power(cpufreq_device, tz, freq, &static_power); + if (ret) { + if (load_cpu) + devm_kfree(&cdev->device, load_cpu); + return ret; + } + + if (load_cpu) { + trace_thermal_power_cpu_get_power( + &cpufreq_device->allowed_cpus, + freq, load_cpu, i, dynamic_power, static_power); + + devm_kfree(&cdev->device, load_cpu); + } + + *power = static_power + dynamic_power; + return 0; +} + +/** + * cpufreq_state2power() - convert a cpu cdev state to power consumed + * @cdev: &thermal_cooling_device pointer + * @tz: a valid thermal zone device pointer + * @state: cooling device state to be converted + * @power: pointer in which to store the resulting power + * + * Convert cooling device state @state into power consumption in + * milliwatts assuming 100% load. Store the calculated power in + * @power. + * + * Return: 0 on success, -EINVAL if the cooling device state could not + * be converted into a frequency or other -E* if there was an error + * when calculating the static power. + */ +static int cpufreq_state2power(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, + unsigned long state, u32 *power) +{ + unsigned int freq, num_cpus; + cpumask_t cpumask; + u32 static_power, dynamic_power; + int ret; + struct cpufreq_cooling_device *cpufreq_device = cdev->devdata; + + cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask); + num_cpus = cpumask_weight(&cpumask); + + /* None of our cpus are online, so no power */ + if (num_cpus == 0) { + *power = 0; + return 0; + } + + freq = cpufreq_device->freq_table[state]; + if (!freq) + return -EINVAL; + + dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus; + ret = get_static_power(cpufreq_device, tz, freq, &static_power); + if (ret) + return ret; + + *power = static_power + dynamic_power; + return 0; +} + +/** + * cpufreq_power2state() - convert power to a cooling device state + * @cdev: &thermal_cooling_device pointer + * @tz: a valid thermal zone device pointer + * @power: power in milliwatts to be converted + * @state: pointer in which to store the resulting state + * + * Calculate a cooling device state for the cpus described by @cdev + * that would allow them to consume at most @power mW and store it in + * @state. Note that this calculation depends on external factors + * such as the cpu load or the current static power. Calling this + * function with the same power as input can yield different cooling + * device states depending on those external factors. + * + * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if + * the calculated frequency could not be converted to a valid state. + * The latter should not happen unless the frequencies available to + * cpufreq have changed since the initialization of the cpu cooling + * device. + */ +static int cpufreq_power2state(struct thermal_cooling_device *cdev, + struct thermal_zone_device *tz, u32 power, + unsigned long *state) +{ + unsigned int cpu, cur_freq, target_freq; + int ret; + s32 dyn_power; + u32 last_load, normalised_power, static_power; + struct cpufreq_cooling_device *cpufreq_device = cdev->devdata; + + cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask); + + /* None of our cpus are online */ + if (cpu >= nr_cpu_ids) + return -ENODEV; + + cur_freq = cpufreq_quick_get(cpu); + ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power); + if (ret) + return ret; + + dyn_power = power - static_power; + dyn_power = dyn_power > 0 ? dyn_power : 0; + last_load = cpufreq_device->last_load ?: 1; + normalised_power = (dyn_power * 100) / last_load; + target_freq = cpu_power_to_freq(cpufreq_device, normalised_power); + + *state = cpufreq_cooling_get_level(cpu, target_freq); + if (*state == THERMAL_CSTATE_INVALID) { + dev_warn_ratelimited(&cdev->device, + "Failed to convert %dKHz for cpu %d into a cdev state\n", + target_freq, cpu); + return -EINVAL; + } + + trace_thermal_power_cpu_limit(&cpufreq_device->allowed_cpus, + target_freq, *state, power); + return 0; +} + /* Bind cpufreq callbacks to thermal cooling device ops */ -static struct thermal_cooling_device_ops const cpufreq_cooling_ops = { +static struct thermal_cooling_device_ops cpufreq_cooling_ops = { .get_max_state = cpufreq_get_max_state, .get_cur_state = cpufreq_get_cur_state, .set_cur_state = cpufreq_set_cur_state, @@ -311,6 +754,9 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table, * @np: a valid struct device_node to the cooling device device tree node * @clip_cpus: cpumask of cpus where the frequency constraints will happen. * Normally this should be same as cpufreq policy->related_cpus. + * @capacitance: dynamic power coefficient for these cpus + * @plat_static_func: function to calculate the static power consumed by these + * cpus (optional) * * This interface function registers the cpufreq cooling device with the name * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq @@ -322,13 +768,14 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table, */ static struct thermal_cooling_device * __cpufreq_cooling_register(struct device_node *np, - const struct cpumask *clip_cpus) + const struct cpumask *clip_cpus, u32 capacitance, + get_static_t plat_static_func) { struct thermal_cooling_device *cool_dev; struct cpufreq_cooling_device *cpufreq_dev; char dev_name[THERMAL_NAME_LENGTH]; struct cpufreq_frequency_table *pos, *table; - unsigned int freq, i; + unsigned int freq, i, num_cpus; int ret; table = cpufreq_frequency_get_table(cpumask_first(clip_cpus)); @@ -341,6 +788,23 @@ __cpufreq_cooling_register(struct device_node *np, if (!cpufreq_dev) return ERR_PTR(-ENOMEM); + num_cpus = cpumask_weight(clip_cpus); + cpufreq_dev->time_in_idle = kcalloc(num_cpus, + sizeof(*cpufreq_dev->time_in_idle), + GFP_KERNEL); + if (!cpufreq_dev->time_in_idle) { + cool_dev = ERR_PTR(-ENOMEM); + goto free_cdev; + } + + cpufreq_dev->time_in_idle_timestamp = + kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp), + GFP_KERNEL); + if (!cpufreq_dev->time_in_idle_timestamp) { + cool_dev = ERR_PTR(-ENOMEM); + goto free_time_in_idle; + } + /* Find max levels */ cpufreq_for_each_valid_entry(pos, table) cpufreq_dev->max_level++; @@ -349,7 +813,7 @@ __cpufreq_cooling_register(struct device_node *np, cpufreq_dev->max_level, GFP_KERNEL); if (!cpufreq_dev->freq_table) { cool_dev = ERR_PTR(-ENOMEM); - goto free_cdev; + goto free_time_in_idle_timestamp; } /* max_level is an index, not a counter */ @@ -357,6 +821,20 @@ __cpufreq_cooling_register(struct device_node *np, cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus); + if (capacitance) { + cpufreq_cooling_ops.get_requested_power = + cpufreq_get_requested_power; + cpufreq_cooling_ops.state2power = cpufreq_state2power; + cpufreq_cooling_ops.power2state = cpufreq_power2state; + cpufreq_dev->plat_get_static_power = plat_static_func; + + ret = build_dyn_power_table(cpufreq_dev, capacitance); + if (ret) { + cool_dev = ERR_PTR(ret); + goto free_table; + } + } + ret = get_idr(&cpufreq_idr, &cpufreq_dev->id); if (ret) { cool_dev = ERR_PTR(ret); @@ -402,6 +880,10 @@ remove_idr: release_idr(&cpufreq_idr, cpufreq_dev-&g |