summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 09:49:13 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 09:49:13 -0700
commitfe38bd6862074c0a2b9be7f31f043aaa70b2af5f (patch)
tree34edf3f546188b108c513b3f8499e45afe37aad9
parent404e634fdb96a3c99c7517353bfafbd88e04ab41 (diff)
parentfb3925d06c285e1acb248addc5d80b33ea771b0f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "s390: - ioctl hardening - selftests ARM: - ITS translation cache - support for 512 vCPUs - various cleanups and bugfixes PPC: - various minor fixes and preparation x86: - bugfixes all over the place (posted interrupts, SVM, emulation corner cases, blocked INIT) - some IPI optimizations" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (75 commits) KVM: X86: Use IPI shorthands in kvm guest when support KVM: x86: Fix INIT signal handling in various CPU states KVM: VMX: Introduce exit reason for receiving INIT signal on guest-mode KVM: VMX: Stop the preemption timer during vCPU reset KVM: LAPIC: Micro optimize IPI latency kvm: Nested KVM MMUs need PAE root too KVM: x86: set ctxt->have_exception in x86_decode_insn() KVM: x86: always stop emulation on page fault KVM: nVMX: trace nested VM-Enter failures detected by H/W KVM: nVMX: add tracepoint for failed nested VM-Enter x86: KVM: svm: Fix a check in nested_svm_vmrun() KVM: x86: Return to userspace with internal error on unexpected exit reason KVM: x86: Add kvm_emulate_{rd,wr}msr() to consolidate VXM/SVM code KVM: x86: Refactor up kvm_{g,s}et_msr() to simplify callers doc: kvm: Fix return description of KVM_SET_MSRS KVM: X86: Tune PLE Window tracepoint KVM: VMX: Change ple_window type to unsigned int KVM: X86: Remove tailing newline for tracepoints KVM: X86: Trace vcpu_id for vmexit KVM: x86: Manually calculate reserved bits when loading PDPTRS ...
-rw-r--r--Documentation/virt/kvm/api.txt33
-rw-r--r--Documentation/virt/kvm/mmu.txt4
-rw-r--r--arch/arm/include/uapi/asm/kvm.h4
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h2
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h4
-rw-r--r--arch/arm64/kvm/hyp/tlb.c14
-rw-r--r--arch/powerpc/include/asm/kvm_host.h22
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h1
-rw-r--r--arch/powerpc/include/asm/xive.h9
-rw-r--r--arch/powerpc/kvm/book3s.c8
-rw-r--r--arch/powerpc/kvm/book3s_hv.c24
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S38
-rw-r--r--arch/powerpc/kvm/book3s_xive.c60
-rw-r--r--arch/powerpc/kvm/book3s_xive.h2
-rw-r--r--arch/powerpc/kvm/book3s_xive_native.c23
-rw-r--r--arch/powerpc/kvm/e500.c3
-rw-r--r--arch/powerpc/kvm/emulate.c1
-rw-r--r--arch/powerpc/kvm/emulate_loadstore.c6
-rw-r--r--arch/powerpc/kvm/powerpc.c3
-rw-r--r--arch/powerpc/sysdev/xive/common.c87
-rw-r--r--arch/powerpc/sysdev/xive/native.c7
-rw-r--r--arch/s390/include/uapi/asm/kvm.h6
-rw-r--r--arch/s390/kvm/kvm-s390.c6
-rw-r--r--arch/x86/include/asm/kvm_emulate.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h19
-rw-r--r--arch/x86/include/asm/vmx.h14
-rw-r--r--arch/x86/include/uapi/asm/vmx.h2
-rw-r--r--arch/x86/kernel/kvm.c12
-rw-r--r--arch/x86/kvm/cpuid.c27
-rw-r--r--arch/x86/kvm/emulate.c27
-rw-r--r--arch/x86/kvm/lapic.c20
-rw-r--r--arch/x86/kvm/mmu.c61
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/svm.c198
-rw-r--r--arch/x86/kvm/trace.h74
-rw-r--r--arch/x86/kvm/vmx/nested.c305
-rw-r--r--arch/x86/kvm/vmx/vmenter.S4
-rw-r--r--arch/x86/kvm/vmx/vmx.c94
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c197
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--include/kvm/arm_vgic.h4
-rw-r--r--include/uapi/linux/kvm.h3
-rw-r--r--tools/testing/selftests/kvm/Makefile10
-rw-r--r--tools/testing/selftests/kvm/dirty_log_test.c61
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h8
-rw-r--r--tools/testing/selftests/kvm/lib/aarch64/ucall.c112
-rw-r--r--tools/testing/selftests/kvm/lib/s390x/ucall.c56
-rw-r--r--tools/testing/selftests/kvm/lib/ucall.c157
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/ucall.c56
-rw-r--r--tools/testing/selftests/kvm/s390x/memop.c166
-rw-r--r--tools/testing/selftests/kvm/s390x/sync_regs_test.c36
-rw-r--r--virt/kvm/arm/arm.c2
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c8
-rw-r--r--virt/kvm/arm/vgic/vgic-irqfd.c36
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c207
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v3.c85
-rw-r--r--virt/kvm/arm/vgic/vgic-v2.c7
-rw-r--r--virt/kvm/arm/vgic/vgic-v3.c7
-rw-r--r--virt/kvm/arm/vgic/vgic.c26
-rw-r--r--virt/kvm/arm/vgic/vgic.h5
-rw-r--r--virt/kvm/kvm_main.c7
63 files changed, 1694 insertions, 797 deletions
diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 2d067767b617..136f1eef3712 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -586,7 +586,7 @@ Capability: basic
Architectures: x86
Type: vcpu ioctl
Parameters: struct kvm_msrs (in)
-Returns: 0 on success, -1 on error
+Returns: number of msrs successfully set (see below), -1 on error
Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the
data structures.
@@ -595,6 +595,11 @@ Application code should set the 'nmsrs' member (which indicates the
size of the entries array), and the 'index' and 'data' members of each
array entry.
+It tries to set the MSRs in array entries[] one by one. If setting an MSR
+fails, e.g., due to setting reserved bits, the MSR isn't supported/emulated
+by KVM, etc..., it stops processing the MSR list and returns the number of
+MSRs that have been set successfully.
+
4.20 KVM_SET_CPUID
@@ -753,8 +758,8 @@ in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to
use PPIs designated for specific cpus. The irq field is interpreted
like this:
-  bits: | 31 ... 24 | 23 ... 16 | 15 ... 0 |
- field: | irq_type | vcpu_index | irq_id |
+  bits: | 31 ... 28 | 27 ... 24 | 23 ... 16 | 15 ... 0 |
+ field: | vcpu2_index | irq_type | vcpu_index | irq_id |
The irq_type field has the following values:
- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ
@@ -766,6 +771,14 @@ The irq_type field has the following values:
In both cases, level is used to assert/deassert the line.
+When KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 is supported, the target vcpu is
+identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index
+must be zero.
+
+Note that on arm/arm64, the KVM_CAP_IRQCHIP capability only conditions
+injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always
+be used for a userspace interrupt controller.
+
struct kvm_irq_level {
union {
__u32 irq; /* GSI */
@@ -3079,12 +3092,14 @@ This exception is also raised directly at the corresponding VCPU if the
flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field.
The start address of the memory region has to be specified in the "gaddr"
-field, and the length of the region in the "size" field. "buf" is the buffer
-supplied by the userspace application where the read data should be written
-to for KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written
-is stored for a KVM_S390_MEMOP_LOGICAL_WRITE. "buf" is unused and can be NULL
-when KVM_S390_MEMOP_F_CHECK_ONLY is specified. "ar" designates the access
-register number to be used.
+field, and the length of the region in the "size" field (which must not
+be 0). The maximum value for "size" can be obtained by checking the
+KVM_CAP_S390_MEM_OP capability. "buf" is the buffer supplied by the
+userspace application where the read data should be written to for
+KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written is
+stored for a KVM_S390_MEMOP_LOGICAL_WRITE. When KVM_S390_MEMOP_F_CHECK_ONLY
+is specified, "buf" is unused and can be NULL. "ar" designates the access
+register number to be used; the valid range is 0..15.
The "reserved" field is meant for future extensions. It is not used by
KVM with the currently defined set of flags.
diff --git a/Documentation/virt/kvm/mmu.txt b/Documentation/virt/kvm/mmu.txt
index 1b9880dfba0a..dadb29e8738f 100644
--- a/Documentation/virt/kvm/mmu.txt
+++ b/Documentation/virt/kvm/mmu.txt
@@ -294,7 +294,7 @@ Handling a page fault is performed as follows:
- walk shadow page table
- check for valid generation number in the spte (see "Fast invalidation of
MMIO sptes" below)
- - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and
+ - cache the information to vcpu->arch.mmio_gva, vcpu->arch.mmio_access and
vcpu->arch.mmio_gfn, and call the emulator
- If both P bit and R/W bit of error code are set, this could possibly
be handled as a "fast page fault" (fixed without taking the MMU lock). See
@@ -304,7 +304,7 @@ Handling a page fault is performed as follows:
- if permissions are insufficient, reflect the fault back to the guest
- determine the host page
- if this is an mmio request, there is no host page; cache the info to
- vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn
+ vcpu->arch.mmio_gva, vcpu->arch.mmio_access and vcpu->arch.mmio_gfn
- walk the shadow page table to find the spte for the translation,
instantiating missing intermediate page tables as necessary
- If this is an mmio request, cache the mmio info to the spte and set some
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index a4217c1a5d01..2769360f195c 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -266,8 +266,10 @@ struct kvm_vcpu_events {
#define KVM_DEV_ARM_ITS_CTRL_RESET 4
/* KVM_IRQ_LINE irq field index values */
+#define KVM_ARM_IRQ_VCPU2_SHIFT 28
+#define KVM_ARM_IRQ_VCPU2_MASK 0xf
#define KVM_ARM_IRQ_TYPE_SHIFT 24
-#define KVM_ARM_IRQ_TYPE_MASK 0xff
+#define KVM_ARM_IRQ_TYPE_MASK 0xf
#define KVM_ARM_IRQ_VCPU_SHIFT 16
#define KVM_ARM_IRQ_VCPU_MASK 0xff
#define KVM_ARM_IRQ_NUM_SHIFT 0
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 92d2e9f28f28..9a21b84536f2 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -77,7 +77,7 @@
})
#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PAGE_S2_XN)
+#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 9a507716ae2f..67c21f9bdbad 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -325,8 +325,10 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
/* KVM_IRQ_LINE irq field index values */
+#define KVM_ARM_IRQ_VCPU2_SHIFT 28
+#define KVM_ARM_IRQ_VCPU2_MASK 0xf
#define KVM_ARM_IRQ_TYPE_SHIFT 24
-#define KVM_ARM_IRQ_TYPE_MASK 0xff
+#define KVM_ARM_IRQ_TYPE_MASK 0xf
#define KVM_ARM_IRQ_VCPU_SHIFT 16
#define KVM_ARM_IRQ_VCPU_MASK 0xff
#define KVM_ARM_IRQ_NUM_SHIFT 0
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index d49a14497715..c466060b76d6 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -193,6 +193,18 @@ void __hyp_text __kvm_flush_vm_context(void)
{
dsb(ishst);
__tlbi(alle1is);
- asm volatile("ic ialluis" : : );
+
+ /*
+ * VIPT and PIPT caches are not affected by VMID, so no maintenance
+ * is necessary across a VMID rollover.
+ *
+ * VPIPT caches constrain lookup and maintenance to the active VMID,
+ * so we need to invalidate lines with a stale VMID to avoid an ABA
+ * race after multiple rollovers.
+ *
+ */
+ if (icache_is_vpipt())
+ asm volatile("ic ialluis");
+
dsb(ish);
}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e6e5f59aaa97..6fb5fb4779e0 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -232,11 +232,25 @@ struct revmap_entry {
};
/*
- * We use the top bit of each memslot->arch.rmap entry as a lock bit,
- * and bit 32 as a present flag. The bottom 32 bits are the
- * index in the guest HPT of a HPTE that points to the page.
+ * The rmap array of size number of guest pages is allocated for each memslot.
+ * This array is used to store usage specific information about the guest page.
+ * Below are the encodings of the various possible usage types.
*/
-#define KVMPPC_RMAP_LOCK_BIT 63
+/* Free bits which can be used to define a new usage */
+#define KVMPPC_RMAP_TYPE_MASK 0xff00000000000000
+#define KVMPPC_RMAP_NESTED 0xc000000000000000 /* Nested rmap array */
+#define KVMPPC_RMAP_HPT 0x0100000000000000 /* HPT guest */
+
+/*
+ * rmap usage definition for a hash page table (hpt) guest:
+ * 0x0000080000000000 Lock bit
+ * 0x0000018000000000 RC bits
+ * 0x0000000100000000 Present bit
+ * 0x00000000ffffffff HPT index bits
+ * The bottom 32 bits are the index in the guest HPT of a HPTE that points to
+ * the page.
+ */
+#define KVMPPC_RMAP_LOCK_BIT 43
#define KVMPPC_RMAP_RC_SHIFT 32
#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
#define KVMPPC_RMAP_PRESENT 0x100000000ul
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2484e6a8f5ca..8e8514efb124 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -598,6 +598,7 @@ extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
union kvmppc_one_reg *val);
extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
union kvmppc_one_reg *val);
+extern bool kvmppc_xive_native_supported(void);
#else
static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index e4016985764e..818989e11678 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -46,7 +46,15 @@ struct xive_irq_data {
/* Setup/used by frontend */
int target;
+ /*
+ * saved_p means that there is a queue entry for this interrupt
+ * in some CPU's queue (not including guest vcpu queues), even
+ * if P is not set in the source ESB.
+ * stale_p means that there is no queue entry for this interrupt
+ * in some CPU's queue, even if P is set in the source ESB.
+ */
bool saved_p;
+ bool stale_p;
};
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
#define XIVE_IRQ_FLAG_LSI 0x02
@@ -127,6 +135,7 @@ extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle,
extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle,
u32 qindex);
extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state);
+extern bool xive_native_has_queue_state_support(void);
#else
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 9524d92bc45d..d7fcdfa7fee4 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1083,9 +1083,11 @@ static int kvmppc_book3s_init(void)
if (xics_on_xive()) {
kvmppc_xive_init_module();
kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
- kvmppc_xive_native_init_module();
- kvm_register_device_ops(&kvm_xive_native_ops,
- KVM_DEV_TYPE_XIVE);
+ if (kvmppc_xive_native_supported()) {
+ kvmppc_xive_native_init_module();
+ kvm_register_device_ops(&kvm_xive_native_ops,
+ KVM_DEV_TYPE_XIVE);
+ }
} else
#endif
kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cde3f5a4b3e4..f8975c620f41 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1678,7 +1678,14 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
*val = get_reg_val(id, vcpu->arch.pspb);
break;
case KVM_REG_PPC_DPDES:
- *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
+ /*
+ * On POWER9, where we are emulating msgsndp etc.,
+ * we return 1 bit for each vcpu, which can come from
+ * either vcore->dpdes or doorbell_request.
+ * On POWER8, doorbell_request is 0.
+ */
+ *val = get_reg_val(id, vcpu->arch.vcore->dpdes |
+ vcpu->arch.doorbell_request);
break;
case KVM_REG_PPC_VTB:
*val = get_reg_val(id, vcpu->arch.vcore->vtb);
@@ -2860,7 +2867,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
if (!spin_trylock(&pvc->lock))
continue;
prepare_threads(pvc);
- if (!pvc->n_runnable) {
+ if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
list_del_init(&pvc->preempt_list);
if (pvc->runner == NULL) {
pvc->vcore_state = VCORE_INACTIVE;
@@ -2881,15 +2888,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
spin_unlock(&lp->lock);
}
-static bool recheck_signals(struct core_info *cip)
+static bool recheck_signals_and_mmu(struct core_info *cip)
{
int sub, i;
struct kvm_vcpu *vcpu;
+ struct kvmppc_vcore *vc;
- for (sub = 0; sub < cip->n_subcores; ++sub)
- for_each_runnable_thread(i, vcpu, cip->vc[sub])
+ for (sub = 0; sub < cip->n_subcores; ++sub) {
+ vc = cip->vc[sub];
+ if (!vc->kvm->arch.mmu_ready)
+ return true;
+ for_each_runnable_thread(i, vcpu, vc)
if (signal_pending(vcpu->arch.run_task))
return true;
+ }
return false;
}
@@ -3119,7 +3131,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
- recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
+ recheck_signals_and_mmu(&core_info)) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 63e0ce91e29d..7186c65c61c9 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -99,7 +99,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
} else {
rev->forw = rev->back = pte_index;
*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
- pte_index | KVMPPC_RMAP_PRESENT;
+ pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT;
}
unlock_rmap(rmap);
}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 337e64468d78..07181d0dfcb7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
ld r11, VCPU_XIVE_SAVED_STATE(r4)
li r9, TM_QW1_OS
lwz r8, VCPU_XIVE_CAM_WORD(r4)
+ cmpwi r8, 0
+ beq no_xive
li r7, TM_QW1_OS + TM_WORD2
mfmsr r0
andi. r0, r0, MSR_DR /* in real mode? */
@@ -2831,29 +2833,39 @@ kvm_cede_prodded:
kvm_cede_exit:
ld r9, HSTATE_KVM_VCPU(r13)
#ifdef CONFIG_KVM_XICS
- /* Abort if we still have a pending escalation */
+ /* are we using XIVE with single escalation? */
+ ld r10, VCPU_XIVE_ESC_VADDR(r9)
+ cmpdi r10, 0
+ beq 3f
+ li r6, XIVE_ESB_SET_PQ_00
+ /*
+ * If we still have a pending escalation, abort the cede,
+ * and we must set PQ to 10 rather than 00 so that we don't
+ * potentially end up with two entries for the escalation
+ * interrupt in the XIVE interrupt queue. In that case
+ * we also don't want to set xive_esc_on to 1 here in
+ * case we race with xive_esc_irq().
+ */
lbz r5, VCPU_XIVE_ESC_ON(r9)
cmpwi r5, 0
- beq 1f
+ beq 4f
li r0, 0
stb r0, VCPU_CEDED(r9)
-1: /* Enable XIVE escalation */
- li r5, XIVE_ESB_SET_PQ_00
+ li r6, XIVE_ESB_SET_PQ_10
+ b 5f
+4: li r0, 1
+ stb r0, VCPU_XIVE_ESC_ON(r9)
+ /* make sure store to xive_esc_on is seen before xive_esc_irq runs */
+ sync
+5: /* Enable XIVE escalation */
mfmsr r0
andi. r0, r0, MSR_DR /* in real mode? */
beq 1f
- ld r10, VCPU_XIVE_ESC_VADDR(r9)
- cmpdi r10, 0
- beq 3f
- ldx r0, r10, r5
+ ldx r0, r10, r6
b 2f
1: ld r10, VCPU_XIVE_ESC_RADDR(r9)
- cmpdi r10, 0
- beq 3f
- ldcix r0, r10, r5
+ ldcix r0, r10, r6
2: sync
- li r0, 1
- stb r0, VCPU_XIVE_ESC_ON(r9)
#endif /* CONFIG_KVM_XICS */
3: b guest_exit_cont
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index e3ba67095895..591bfb4bfd0f 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
u64 pq;
- if (!tima)
+ /*
+ * Nothing to do if the platform doesn't have a XIVE
+ * or this vCPU doesn't have its own XIVE context
+ * (e.g. because it's not using an in-kernel interrupt controller).
+ */
+ if (!tima || !vcpu->arch.xive_cam_word)
return;
+
eieio();
__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
@@ -160,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
*/
vcpu->arch.xive_esc_on = false;
+ /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+ smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
+
return IRQ_HANDLED;
}
@@ -1113,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
vcpu->arch.xive_esc_raddr = 0;
}
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with. However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it). Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * This slightly odd sequence gives the right result
+ * (i.e. stale_p set if xive_esc_on is false) even if
+ * we race with xive_esc_irq() and xive_irq_eoi().
+ */
+ xd->stale_p = false;
+ smp_mb(); /* paired with smb_wmb in xive_esc_irq */
+ if (!vcpu->arch.xive_esc_on)
+ xd->stale_p = true;
+}
+
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -1134,20 +1168,28 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Mask the VP IPI */
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
- /* Disable the VP */
- xive_native_disable_vp(xc->vp_id);
-
- /* Free the queues & associated interrupts */
+ /* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
- struct xive_q *q = &xc->queues[i];
-
- /* Free the escalation irq */
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
}
- /* Free the queue */
+ }
+
+ /* Disable the VP */
+ xive_native_disable_vp(xc->vp_id);
+
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+