From f36963c9d3f6f415732710da3acdd8608a9fa0e5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 9 May 2015 03:14:13 +0930 Subject: cpumask_set_cpu_local_first => cpumask_local_spread, lament da91309e0a7e (cpumask: Utility function to set n'th cpu...) created a genuinely weird function. I never saw it before, it went through DaveM. (He only does this to make us other maintainers feel better about our own mistakes.) cpumask_set_cpu_local_first's purpose is say "I need to spread things across N online cpus, choose the ones on this numa node first"; you call it in a loop. It can fail. One of the two callers ignores this, the other aborts and fails the device open. It can fail in two ways: allocating the off-stack cpumask, or through a convoluted codepath which AFAICT can only occur if cpu_online_mask changes. Which shouldn't happen, because if cpu_online_mask can change while you call this, it could return a now-offline cpu anyway. It contains a nonsensical test "!cpumask_of_node(numa_node)". This was drawn to my attention by Geert, who said this causes a warning on Sparc. It sets a single bit in a cpumask instead of returning a cpu number, because that's what the callers want. It could be made more efficient by passing the previous cpu rather than an index, but that would be more invasive to the callers. Fixes: da91309e0a7e8966d916a74cce42ed170fde06bf Signed-off-by: Rusty Russell (then rebased) Tested-by: Amir Vadai Acked-by: Amir Vadai Acked-by: David S. Miller --- lib/cpumask.c | 74 +++++++++++++++++++++-------------------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) (limited to 'lib') diff --git a/lib/cpumask.c b/lib/cpumask.c index 830dd5dec40f..5f627084f2e9 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -139,64 +139,42 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) #endif /** - * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first - * + * cpumask_local_spread - select the i'th cpu with local numa cpu's first * @i: index number - * @numa_node: local numa_node - * @dstp: cpumask with the relevant cpu bit set according to the policy + * @node: local numa_node * - * This function sets the cpumask according to a numa aware policy. - * cpumask could be used as an affinity hint for the IRQ related to a - * queue. When the policy is to spread queues across cores - local cores - * first. + * This function selects an online CPU according to a numa aware policy; + * local cpus are returned first, followed by non-local ones, then it + * wraps around. * - * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set - * the cpu bit and need to re-call the function. + * It's not very efficient, but useful for setup. */ -int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +unsigned int cpumask_local_spread(unsigned int i, int node) { - cpumask_var_t mask; int cpu; - int ret = 0; - - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - if (numa_node == -1 || !cpumask_of_node(numa_node)) { - /* Use all online cpu's for non numa aware system */ - cpumask_copy(mask, cpu_online_mask); + if (node == -1) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; } else { - int n; - - cpumask_and(mask, - cpumask_of_node(numa_node), cpu_online_mask); - - n = cpumask_weight(mask); - if (i >= n) { - i -= n; - - /* If index > number of local cpu's, mask out local - * cpu's - */ - cpumask_andnot(mask, cpu_online_mask, mask); + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. */ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; } } - - for_each_cpu(cpu, mask) { - if (--i < 0) - goto out; - } - - ret = -EAGAIN; - -out: - free_cpumask_var(mask); - - if (!ret) - cpumask_set_cpu(cpu, dstp); - - return ret; + BUG(); } -EXPORT_SYMBOL(cpumask_set_cpu_local_first); +EXPORT_SYMBOL(cpumask_local_spread); -- cgit v1.2.3 From 80188b0d77d7426b494af739ac129e0e684acb84 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:39:34 +1000 Subject: percpu_counter: batch size aware __percpu_counter_compare() XFS uses non-stanard batch sizes for avoiding frequent global counter updates on it's allocated inode counters, as they increment or decrement in batches of 64 inodes. Hence the standard percpu counter batch of 32 means that the counter is effectively a global counter. Currently Xfs uses a batch size of 128 so that it doesn't take the global lock on every single modification. However, Xfs also needs to compare accurately against zero, which means we need to use percpu_counter_compare(), and that has a hard-coded batch size of 32, and hence will spuriously fail to detect when it is supposed to use precise comparisons and hence the accounting goes wrong. Add __percpu_counter_compare() to take a custom batch size so we can use it sanely in XFS and factor percpu_counter_compare() to use it. Signed-off-by: Dave Chinner Acked-by: Tejun Heo Signed-off-by: Dave Chinner --- lib/percpu_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 48144cdae819..f051d69f0910 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -197,13 +197,13 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ -int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) +int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ - if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) { + if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else @@ -218,7 +218,7 @@ int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) else return 0; } -EXPORT_SYMBOL(percpu_counter_compare); +EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { -- cgit v1.2.3