From b977fcf477c176e5f41775f0ea139f935b0f25b7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 31 Jul 2019 15:13:19 +0100
Subject: irqdomain/debugfs: Use PAs to generate fwnode names

Booting a large arm64 server (HiSi D05) leads to the following shouting
at boot time:

[ 20.722132] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!
[ 20.730851] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!
[ 20.739560] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!
[ 20.748267] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!
[ 20.756975] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!
[ 20.765683] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!
[ 20.774391] debugfs: File 'irqchip@(____ptrval____)-3' in directory 'domains' already present!

and many more...

Evidently, we expect something a bit more informative than ____ptrval____,
and certainly we want all of our domains, not just the first one.

For that, turn the %p used to generate the fwnode name into something
that won't be repainted (%pa). Given that we've now fixed all users to
pass a pointer to a PA, it will actually do the right thing.

Acked-by: Thomas Gleixner
Signed-off-by: Marc Zyngier
---
 kernel/irq/irqdomain.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3078d0e48bba..e7bbab149750 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -31,7 +31,7 @@ struct irqchip_fwid {
 	struct fwnode_handle fwnode;
 	unsigned int type;
 	char *name;
-	void *data;
+	phys_addr_t *pa;
 };
 
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
@@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
  * domain struct.
  */
 struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
-						const char *name, void *data)
+						const char *name,
+						phys_addr_t *pa)
 {
 	struct irqchip_fwid *fwid;
 	char *n;
@@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
 		n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
 		break;
 	default:
-		n = kasprintf(GFP_KERNEL, "irqchip@%p", data);
+		n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa);
 		break;
 	}
 
@@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
 
 	fwid->type = type;
 	fwid->name = n;
-	fwid->data = data;
+	fwid->pa = pa;
 	fwid->fwnode.ops = &irqchip_fwnode_ops;
 	return &fwid->fwnode;
 }
-- cgit v1.2.3


From b6a32bbd8735def2d0d696ba59205d1874b7800f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 16 Aug 2019 18:09:23 +0200
Subject: genirq: Force interrupt threading on RT

Switch force_irqthreads from a boot time modifiable variable to a
compile time constant when CONFIG_PREEMPT_RT is enabled.

Signed-off-by: Thomas Gleixner
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190816160923.12855-1-bigeasy@linutronix.de
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..97de1b7d43af 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -23,7 +23,7 @@
 
 #include "internals.h"
 
-#ifdef CONFIG_IRQ_FORCED_THREADING
+#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
 __read_mostly bool force_irqthreads;
 EXPORT_SYMBOL_GPL(force_irqthreads);
 
-- cgit v1.2.3


From 53c1788b7d7720565214a466afffdc818d8c6e5f Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 16 Aug 2019 10:28:48 +0800
Subject: genirq/affinity: Improve __irq_build_affinity_masks()

One invariant of __irq_build_affinity_masks() is that all CPUs in the
specified masks (cpu_mask AND node_to_cpumask for each node) should be
covered during the spread. Even after all requested vectors have been
allocated, it is still required to spread vectors among the remaining
CPUs. A similar policy has already been applied in the
'numvecs <= nodes' case.

So remove the following check inside the loop:

	if (done >= numvecs)
		break;

Meanwhile, assign at least one vector to each remaining node once
'numvecs' vectors have already been handled.

Also, if the specified cpumask for one numa node is empty, simply do
not spread vectors on this node.

Signed-off-by: Ming Lei
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190816022849.14075-2-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 6fef48033f96..c7cca942bd8a 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -129,14 +129,26 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 	for_each_node_mask(n, nodemsk) {
 		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
 
-		/* Spread the vectors per node */
-		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
-
 		/* Get the cpus on this node which are in the mask */
 		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
-
-		/* Calculate the number of cpus per vector */
 		ncpus = cpumask_weight(nmsk);
+		if (!ncpus)
+			continue;
+
+		/*
+		 * Calculate the number of cpus per vector
+		 *
+		 * Spread the vectors evenly per node. If the requested
+		 * vector number has been reached, simply allocate one
+		 * vector for each remaining node so that all nodes can
+		 * be covered
+		 */
+		if (numvecs > done)
+			vecs_per_node = max_t(unsigned,
+					(numvecs - done) / nodes, 1);
+		else
+			vecs_per_node = 1;
+
 		vecs_to_assign = min(vecs_per_node, ncpus);
 
 		/* Account for rounding errors */
@@ -156,13 +168,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 		}
 
 		done += v;
-		if (done >= numvecs)
-			break;
 		if (curvec >= last_affv)
 			curvec = firstvec;
 		--nodes;
 	}
-	return done;
+	return done < numvecs ? done : numvecs;
 }
 
 /*
-- cgit v1.2.3


From b1a5a73e64e99faa5f4deef2ae96d7371a0fb5d0 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 16 Aug 2019 10:28:49 +0800
Subject: genirq/affinity: Spread vectors on node according to nr_cpu ratio

Now __irq_build_affinity_masks() spreads vectors evenly per node, but
there is a case in which not all vectors are spread when the numa nodes
have different numbers of CPUs, which triggers the warning in the
spreading code.

Improve the spreading algorithm by

 - assigning vectors according to the ratio of the number of CPUs on a
   node to the number of remaining CPUs.

 - running the assignment from smaller nodes to bigger nodes to
   guarantee that every active node gets allocated at least one vector.

This ensures that all vectors are spread out. Aside from that, the
spread becomes fairer if the nodes have different numbers of CPUs.

For example, on the following machine:

	CPU(s):              16
	On-line CPU(s) list: 0-15
	Thread(s) per core:  1
	Core(s) per socket:  8
	Socket(s):           2
	NUMA node(s):        2
	...
	NUMA node0 CPU(s):   0,1,3,5-9,11,13-15
	NUMA node1 CPU(s):   2,4,10,12

When a driver requests to allocate 8 vectors, the following spread
results:

	irq 31, cpu list 2,4
	irq 32, cpu list 10,12
	irq 33, cpu list 0-1
	irq 34, cpu list 3,5
	irq 35, cpu list 6-7
	irq 36, cpu list 8-9
	irq 37, cpu list 11,13
	irq 38, cpu list 14-15

So Node 0 now has 6 and Node 1 has 2 vectors assigned. The original
algorithm assigned 4 vectors on each node, which was unfair versus
Node 0.

[ tglx: Massaged changelog ]

Reported-by: Jon Derrick
Signed-off-by: Ming Lei
Signed-off-by: Thomas Gleixner
Reviewed-by: Keith Busch
Reviewed-by: Jon Derrick
Link: https://lkml.kernel.org/r/20190816022849.14075-3-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 239 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 200 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index c7cca942bd8a..d905e844bf3a 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/sort.h>
 
 static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 				unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 	return nodes;
 }
 
+struct node_vectors {
+	unsigned id;
+
+	union {
+		unsigned nvectors;
+		unsigned ncpus;
+	};
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+	const struct node_vectors *ln = l;
+	const struct node_vectors *rn = r;
+
+	return ln->ncpus - rn->ncpus;
+}
+
+/*
+ * Allocate vector number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= active CPU number of this node
+ *
+ * The actual allocated total vectors may be less than @numvecs when
+ * active total CPU number is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+				const cpumask_var_t *node_to_cpumask,
+				const struct cpumask *cpu_mask,
+				const nodemask_t nodemsk,
+				struct cpumask *nmsk,
+				struct node_vectors *node_vectors)
+{
+	unsigned n, remaining_ncpus = 0;
+
+	for (n = 0; n < nr_node_ids; n++) {
+		node_vectors[n].id = n;
+		node_vectors[n].ncpus = UINT_MAX;
+	}
+
+	for_each_node_mask(n, nodemsk) {
+		unsigned ncpus;
+
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		ncpus = cpumask_weight(nmsk);
+
+		if (!ncpus)
+			continue;
+		remaining_ncpus += ncpus;
+		node_vectors[n].ncpus = ncpus;
+	}
+
+	numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+	sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+	     ncpus_cmp_func, NULL);
+
+	/*
+	 * Allocate vectors for each node according to the ratio of this
+	 * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+	 * bigger than number of active numa nodes. Always start the
+	 * allocation from the node with minimized nr_cpus.
+	 *
+	 * This way guarantees that each active node gets allocated at
+	 * least one vector, and the theory is simple: over-allocation
+	 * is only done when this node is assigned by one vector, so
+	 * other nodes will be allocated >= 1 vector, since 'numvecs' is
+	 * bigger than number of numa nodes.
+	 *
+	 * One perfect invariant is that number of allocated vectors for
+	 * each node is <= CPU count of this node:
+	 *
+	 * 1) suppose there are two nodes: A and B
+	 * 	ncpu(X) is CPU count of node X
+	 * 	vecs(X) is the vector count allocated to node X via this
+	 * 	algorithm
+	 *
+	 * 	ncpu(A) <= ncpu(B)
+	 * 	ncpu(A) + ncpu(B) = N
+	 * 	vecs(A) + vecs(B) = V
+	 *
+	 * 	vecs(A) = max(1, round_down(V * ncpu(A) / N))
+	 * 	vecs(B) = V - vecs(A)
+	 *
+	 * 	both N and V are integer, and 2 <= V <= N, suppose
+	 * 	V = N - delta, and 0 <= delta <= N - 2
+	 *
+	 * 2) obviously vecs(A) <= ncpu(A) because:
+	 *
+	 * 	if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+	 * 	ncpu(A) >= 1
+	 *
+	 * 	otherwise,
+	 * 		vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+	 *
+	 * 3) prove how vecs(B) <= ncpu(B):
+	 *
+	 * 	if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+	 * 	over-allocated, so vecs(B) <= ncpu(B),
+	 *
+	 * 	otherwise:
+	 *
+	 * 	vecs(A) =
+	 * 		round_down(V * ncpu(A) / N) =
+	 * 		round_down((N - delta) * ncpu(A) / N) =
+	 * 		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+	 * 		round_down((N * ncpu(A) - delta * N) / N) =
+	 * 		ncpu(A) - delta
+	 *
+	 * 	then:
+	 *
+	 * 	vecs(A) - V >= ncpu(A) - delta - V
+	 * 	=>
+	 * 	V - vecs(A) <= V + delta - ncpu(A)
+	 * 	=>
+	 * 	vecs(B) <= N - ncpu(A)
+	 * 	=>
+	 * 	vecs(B) <= ncpu(B)
+	 *
+	 * For nodes >= 3, it can be thought of as one node and another big
+	 * node, given that this is exactly how the algorithm is implemented,
+	 * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+	 * finally for each node X: vecs(X) <= ncpu(X).
+	 *
+	 */
+	for (n = 0; n < nr_node_ids; n++) {
+		unsigned nvectors, ncpus;
+
+		if (node_vectors[n].ncpus == UINT_MAX)
+			continue;
+
+		WARN_ON_ONCE(numvecs == 0);
+
+		ncpus = node_vectors[n].ncpus;
+		nvectors = max_t(unsigned, 1,
+				 numvecs * ncpus / remaining_ncpus);
+		WARN_ON_ONCE(nvectors > ncpus);
+
+		node_vectors[n].nvectors = nvectors;
+
+		remaining_ncpus -= ncpus;
+		numvecs -= nvectors;
+	}
+}
+
 static int __irq_build_affinity_masks(unsigned int startvec,
 				      unsigned int numvecs,
 				      unsigned int firstvec,
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 				      struct cpumask *nmsk,
 				      struct irq_affinity_desc *masks)
 {
-	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+	unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	unsigned int last_affv = firstvec + numvecs;
 	unsigned int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
+	struct node_vectors *node_vectors;
 
 	if (!cpumask_weight(cpu_mask))
 		return 0;
@@ -126,53 +277,57 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 		return numvecs;
 	}
 
-	for_each_node_mask(n, nodemsk) {
-		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+	node_vectors = kcalloc(nr_node_ids,
+			       sizeof(struct node_vectors),
+			       GFP_KERNEL);
+	if (!node_vectors)
+		return -ENOMEM;
+
+	/* allocate vector number for each node */
+	alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+			    nodemsk, nmsk, node_vectors);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		unsigned int ncpus, v;
+		struct node_vectors *nv = &node_vectors[i];
+
+		if (nv->nvectors == UINT_MAX)
+			continue;
 
 		/* Get the cpus on this node which are in the mask */
-		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
 		ncpus = cpumask_weight(nmsk);
 		if (!ncpus)
 			continue;
 
-		/*
-		 * Calculate the number of cpus per vector
-		 *
-		 * Spread the vectors evenly per node. If the requested
-		 * vector number has been reached, simply allocate one
-		 * vector for each remaining node so that all nodes can
-		 * be covered
-		 */
-		if (numvecs > done)
-			vecs_per_node = max_t(unsigned,
-					(numvecs - done) / nodes, 1);
-		else
-			vecs_per_node = 1;
-
-		vecs_to_assign = min(vecs_per_node, ncpus);
+		WARN_ON_ONCE(nv->nvectors > ncpus);
 
 		/* Account for rounding errors */
-		extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+		extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
 
-		for (v = 0; curvec < last_affv && v < vecs_to_assign;
-		     curvec++, v++) {
-			cpus_per_vec = ncpus / vecs_to_assign;
+		/* Spread allocated vectors on CPUs of the current node */
+		for (v = 0; v < nv->nvectors; v++, curvec++) {
+			cpus_per_vec = ncpus / nv->nvectors;
 
 			/* Account for extra vectors to compensate rounding errors */
 			if (extra_vecs) {
 				cpus_per_vec++;
 				--extra_vecs;
 			}
+
+			/*
+			 * wrapping has to be considered given 'startvec'
+			 * may start anywhere
+			 */
+			if (curvec >= last_affv)
+				curvec = firstvec;
 			irq_spread_init_one(&masks[curvec].mask, nmsk, cpus_per_vec);
 		}
-
-		done += v;
-		if (curvec >= last_affv)
-			curvec = firstvec;
-		--nodes;
+		done += nv->nvectors;
 	}
-	return done < numvecs ? done : numvecs;
+	kfree(node_vectors);
+	return done;
 }
 
 /*
@@ -184,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 				    unsigned int firstvec,
 				    struct irq_affinity_desc *masks)
 {
-	unsigned int curvec = startvec, nr_present, nr_others;
+	unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
 	cpumask_var_t *node_to_cpumask;
 	cpumask_var_t nmsk, npresmsk;
 	int ret = -ENOMEM;
@@ -199,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 	if (!node_to_cpumask)
 		goto fail_npresmsk;
 
-	ret = 0;
 	/* Stabilize the cpumasks */
 	get_online_cpus();
 	build_node_to_cpumask(node_to_cpumask);
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
-	nr_present = __irq_build_affinity_masks(curvec, numvecs,
-						firstvec, node_to_cpumask,
-						cpu_present_mask, nmsk, masks);
+	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+					 node_to_cpumask, cpu_present_mask,
+					 nmsk, masks);
+	if (ret < 0)
+		goto fail_build_affinity;
+	nr_present = ret;
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -220,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 	else
 		curvec = firstvec + nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	nr_others = __irq_build_affinity_masks(curvec, numvecs,
-					       firstvec, node_to_cpumask,
-					       npresmsk, nmsk, masks);
+	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+					 node_to_cpumask, npresmsk, nmsk,
+					 masks);
+	if (ret >= 0)
+		nr_others = ret;
+
+ fail_build_affinity:
 	put_online_cpus();
 
-	if (nr_present < numvecs)
+	if (ret >= 0)
 		WARN_ON(nr_present + nr_others < numvecs);
 
 	free_node_to_cpumask(node_to_cpumask);
@@ -235,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
 
  fail_nmsk:
 	free_cpumask_var(nmsk);
-	return ret;
+	return ret < 0 ? ret : 0;
 }
 
 static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
-- cgit v1.2.3


From 101f85b56d03b36418bbf867f67d81710839b0ec Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 28 Aug 2019 16:58:15 +0800
Subject: genirq/affinity: Remove const qualifier from node_to_cpumask argument

When CONFIG_CPUMASK_OFFSTACK isn't enabled, 'cpumask_var_t' is defined
as 'typedef struct cpumask cpumask_var_t[1]', so the 'node_to_cpumask'
argument of alloc_nodes_vectors() can't be declared as
'const cpumask_var_t *'.

Fixes the following warning:

  kernel/irq/affinity.c: In function '__irq_build_affinity_masks':
     alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
     ^
  kernel/irq/affinity.c:128:13: note: expected 'const struct cpumask (*)[1]' but argument is of type 'struct cpumask (*)[1]'
   static void alloc_nodes_vectors(unsigned int numvecs,
   ^

Fixes: b1a5a73e64e9 ("genirq/affinity: Spread vectors on node according to nr_cpu ratio")
Reported-by: kbuild test robot
Signed-off-by: Ming Lei
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20190828085815.19931-1-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d905e844bf3a..4d89ad4fae3b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -126,7 +126,7 @@ static int ncpus_cmp_func(const void *l, const void *r)
  * for each node.
  */
 static void alloc_nodes_vectors(unsigned int numvecs,
-				const cpumask_var_t *node_to_cpumask,
+				cpumask_var_t *node_to_cpumask,
 				const struct cpumask *cpu_mask,
 				const nodemask_t nodemsk,
 				struct cpumask *nmsk,
 				struct node_vectors *node_vectors)
-- cgit v1.2.3


From 711419e504ebd68c8f03656616829c8ad7829389 Mon Sep 17 00:00:00 2001
From: Dexuan Cui
Date: Mon, 2 Sep 2019 23:14:56 +0000
Subject: irqdomain: Add the missing assignment of domain->fwnode for named fwnode

Recently, device pass-through stopped working for Linux VMs running on
Hyper-V. git-bisect shows the regression is caused by the recent commit
467a3bb97432 ("PCI: hv: Allocate a named fwnode ..."), but the root
cause is that commit d59f6617eef0 forgot to set domain->fwnode for
IRQCHIP_FWNODE_NAMED*, and as a result:

1. domain->fwnode remains NULL.

2. irq_find_matching_fwspec() returns NULL since "h->fwnode == fwnode"
   is false, and pci_set_bus_msi_domain() sets the Hyper-V PCI root
   bus's msi_domain to NULL.

3. When the device is added onto the root bus, the device's
   dev->msi_domain is set to NULL in pci_set_msi_domain().

4. When a device driver tries to enable MSI-X, pci_msi_setup_msi_irqs()
   calls arch_setup_msi_irqs(), which uses the native MSI chip (i.e.
   arch/x86/kernel/apic/msi.c: pci_msi_controller) to set up the irqs,
   but actually pci_msi_setup_msi_irqs() is supposed to call
   msi_domain_alloc_irqs() with the hbus->irq_domain, which is created
   in hv_pcie_init_irq_domain() and is associated with the Hyper-V chip
   hv_msi_irq_chip.

Consequently, the irq line is not properly set up, and the device
driver can not receive any interrupt.

Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only")
Fixes: 467a3bb97432 ("PCI: hv: Allocate a named fwnode instead of an address-based one")
Reported-by: Lili Deng
Signed-off-by: Dexuan Cui
Signed-off-by: Marc Zyngier
Link: https://lore.kernel.org/r/PU1P153MB01694D9AF625AC335C600C5FBFBE0@PU1P153MB0169.APCP153.PROD.OUTLOOK.COM
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e7bbab149750..132672b74e4b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -149,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 		switch (fwid->type) {
 		case IRQCHIP_FWNODE_NAMED:
 		case IRQCHIP_FWNODE_NAMED_ID:
+			domain->fwnode = fwnode;
 			domain->name = kstrdup(fwid->name, GFP_KERNEL);
 			if (!domain->name) {
 				kfree(domain);
-- cgit v1.2.3
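
For reference, below is a minimal userspace C sketch (not kernel code) of the
proportional allocation used by alloc_nodes_vectors() in the "Spread vectors
on node according to nr_cpu ratio" patch above: nodes are sorted by ascending
CPU count, and each node receives max(1, numvecs * ncpus / remaining_ncpus)
vectors, with remaining_ncpus and numvecs recomputed after every node. The
node sizes are taken from the changelog's example machine (12 CPUs on node 0,
4 CPUs on node 1); everything else is illustrative.

#include <stdio.h>
#include <stdlib.h>

struct node_vectors {
	unsigned int id;
	unsigned int ncpus;
	unsigned int nvectors;
};

/* Sort nodes by ascending CPU count, like ncpus_cmp_func() in the patch */
static int ncpus_cmp(const void *l, const void *r)
{
	const struct node_vectors *ln = l, *rn = r;

	return (int)ln->ncpus - (int)rn->ncpus;
}

int main(void)
{
	/* Example data from the changelog: node0 has 12 CPUs, node1 has 4 */
	struct node_vectors nodes[] = { { 0, 12, 0 }, { 1, 4, 0 } };
	unsigned int nr_nodes = sizeof(nodes) / sizeof(nodes[0]);
	unsigned int numvecs = 8, remaining_ncpus = 0, n;

	for (n = 0; n < nr_nodes; n++)
		remaining_ncpus += nodes[n].ncpus;

	/* Never hand out more vectors than there are CPUs */
	if (numvecs > remaining_ncpus)
		numvecs = remaining_ncpus;

	/* Smallest node first, so it cannot be starved below one vector */
	qsort(nodes, nr_nodes, sizeof(nodes[0]), ncpus_cmp);

	for (n = 0; n < nr_nodes; n++) {
		unsigned int v = numvecs * nodes[n].ncpus / remaining_ncpus;

		nodes[n].nvectors = v ? v : 1;	/* at least one vector */
		remaining_ncpus -= nodes[n].ncpus;
		numvecs -= nodes[n].nvectors;
	}

	for (n = 0; n < nr_nodes; n++)
		printf("node %u: %u CPUs -> %u vectors\n",
		       nodes[n].id, nodes[n].ncpus, nodes[n].nvectors);
	return 0;
}

With 8 requested vectors this prints 2 vectors for node 1 and 6 for node 0,
matching the irq list shown in the changelog.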