summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kvm/book3s_hv_nested.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-10-25 17:57:35 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-10-25 17:57:35 -0700
commit0d1e8b8d2bcd3150d51754d8d0fdbf44dc88b0d3 (patch)
tree2794cb2347daa76b00160a6ffb68663f4138dcc7 /arch/powerpc/kvm/book3s_hv_nested.c
parent83c4087ce468601501ecde4d0ec5b2abd5f57c31 (diff)
parent22a7cdcae6a4a3c8974899e62851d270956f58ce (diff)
Merge tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Radim Krčmář: "ARM: - Improved guest IPA space support (32 to 52 bits) - RAS event delivery for 32bit - PMU fixes - Guest entry hardening - Various cleanups - Port of dirty_log_test selftest PPC: - Nested HV KVM support for radix guests on POWER9. The performance is much better than with PR KVM. Migration and arbitrary level of nesting is supported. - Disable nested HV-KVM on early POWER9 chips that need a particular hardware bug workaround - One VM per core mode to prevent potential data leaks - PCI pass-through optimization - merge ppc-kvm topic branch and kvm-ppc-fixes to get a better base s390: - Initial version of AP crypto virtualization via vfio-mdev - Improvement for vfio-ap - Set the host program identifier - Optimize page table locking x86: - Enable nested virtualization by default - Implement Hyper-V IPI hypercalls - Improve #PF and #DB handling - Allow guests to use Enlightened VMCS - Add migration selftests for VMCS and Enlightened VMCS - Allow coalesced PIO accesses - Add an option to perform nested VMCS host state consistency check through hardware - Automatic tuning of lapic_timer_advance_ns - Many fixes, minor improvements, and cleanups" * tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits) KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned Revert "kvm: x86: optimize dr6 restore" KVM: PPC: Optimize clearing TCEs for sparse tables x86/kvm/nVMX: tweak shadow fields selftests/kvm: add missing executables to .gitignore KVM: arm64: Safety check PSTATE when entering guest and handle IL KVM: PPC: Book3S HV: Don't use streamlined entry path on early POWER9 chips arm/arm64: KVM: Enable 32 bits kvm vcpu events support arm/arm64: KVM: Rename function kvm_arch_dev_ioctl_check_extension() KVM: arm64: Fix caching of host MDCR_EL2 value KVM: VMX: enable nested virtualization by default KVM/x86: Use 32bit xor to clear registers in svm.c kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD kvm: vmx: Defer setting of DR6 until #DB delivery kvm: x86: Defer setting of CR2 until #PF delivery kvm: x86: Add payload operands to kvm_multiple_exception kvm: x86: Add exception payload fields to kvm_vcpu_events kvm: x86: Add has_payload and payload to kvm_queued_exception KVM: Documentation: Fix omission in struct kvm_vcpu_events KVM: selftests: add Enlightened VMCS test ...
Diffstat (limited to 'arch/powerpc/kvm/book3s_hv_nested.c')
-rw-r--r--arch/powerpc/kvm/book3s_hv_nested.c1291
1 files changed, 1291 insertions, 0 deletions
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644
index 000000000000..401d2ecbebc5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,1291 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ * Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/llist.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
+
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ hr->pcr = vc->pcr;
+ hr->dpdes = vc->dpdes;
+ hr->hfscr = vcpu->arch.hfscr;
+ hr->tb_offset = vc->tb_offset;
+ hr->dawr0 = vcpu->arch.dawr;
+ hr->dawrx0 = vcpu->arch.dawrx;
+ hr->ciabr = vcpu->arch.ciabr;
+ hr->purr = vcpu->arch.purr;
+ hr->spurr = vcpu->arch.spurr;
+ hr->ic = vcpu->arch.ic;
+ hr->vtb = vc->vtb;
+ hr->srr0 = vcpu->arch.shregs.srr0;
+ hr->srr1 = vcpu->arch.shregs.srr1;
+ hr->sprg[0] = vcpu->arch.shregs.sprg0;
+ hr->sprg[1] = vcpu->arch.shregs.sprg1;
+ hr->sprg[2] = vcpu->arch.shregs.sprg2;
+ hr->sprg[3] = vcpu->arch.shregs.sprg3;
+ hr->pidr = vcpu->arch.pid;
+ hr->cfar = vcpu->arch.cfar;
+ hr->ppr = vcpu->arch.ppr;
+}
+
+static void byteswap_pt_regs(struct pt_regs *regs)
+{
+ unsigned long *addr = (unsigned long *) regs;
+
+ for (; addr < ((unsigned long *) (regs + 1)); addr++)
+ *addr = swab64(*addr);
+}
+
+static void byteswap_hv_regs(struct hv_guest_state *hr)
+{
+ hr->version = swab64(hr->version);
+ hr->lpid = swab32(hr->lpid);
+ hr->vcpu_token = swab32(hr->vcpu_token);
+ hr->lpcr = swab64(hr->lpcr);
+ hr->pcr = swab64(hr->pcr);
+ hr->amor = swab64(hr->amor);
+ hr->dpdes = swab64(hr->dpdes);
+ hr->hfscr = swab64(hr->hfscr);
+ hr->tb_offset = swab64(hr->tb_offset);
+ hr->dawr0 = swab64(hr->dawr0);
+ hr->dawrx0 = swab64(hr->dawrx0);
+ hr->ciabr = swab64(hr->ciabr);
+ hr->hdec_expiry = swab64(hr->hdec_expiry);
+ hr->purr = swab64(hr->purr);
+ hr->spurr = swab64(hr->spurr);
+ hr->ic = swab64(hr->ic);
+ hr->vtb = swab64(hr->vtb);
+ hr->hdar = swab64(hr->hdar);
+ hr->hdsisr = swab64(hr->hdsisr);
+ hr->heir = swab64(hr->heir);
+ hr->asdr = swab64(hr->asdr);
+ hr->srr0 = swab64(hr->srr0);
+ hr->srr1 = swab64(hr->srr1);
+ hr->sprg[0] = swab64(hr->sprg[0]);
+ hr->sprg[1] = swab64(hr->sprg[1]);
+ hr->sprg[2] = swab64(hr->sprg[2]);
+ hr->sprg[3] = swab64(hr->sprg[3]);
+ hr->pidr = swab64(hr->pidr);
+ hr->cfar = swab64(hr->cfar);
+ hr->ppr = swab64(hr->ppr);
+}
+
+static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
+ struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ hr->dpdes = vc->dpdes;
+ hr->hfscr = vcpu->arch.hfscr;
+ hr->purr = vcpu->arch.purr;
+ hr->spurr = vcpu->arch.spurr;
+ hr->ic = vcpu->arch.ic;
+ hr->vtb = vc->vtb;
+ hr->srr0 = vcpu->arch.shregs.srr0;
+ hr->srr1 = vcpu->arch.shregs.srr1;
+ hr->sprg[0] = vcpu->arch.shregs.sprg0;
+ hr->sprg[1] = vcpu->arch.shregs.sprg1;
+ hr->sprg[2] = vcpu->arch.shregs.sprg2;
+ hr->sprg[3] = vcpu->arch.shregs.sprg3;
+ hr->pidr = vcpu->arch.pid;
+ hr->cfar = vcpu->arch.cfar;
+ hr->ppr = vcpu->arch.ppr;
+ switch (trap) {
+ case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+ hr->hdar = vcpu->arch.fault_dar;
+ hr->hdsisr = vcpu->arch.fault_dsisr;
+ hr->asdr = vcpu->arch.fault_gpa;
+ break;
+ case BOOK3S_INTERRUPT_H_INST_STORAGE:
+ hr->asdr = vcpu->arch.fault_gpa;
+ break;
+ case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+ hr->heir = vcpu->arch.emul_inst;
+ break;
+ }
+}
+
+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+ /*
+ * Don't let L1 enable features for L2 which we've disabled for L1,
+ * but preserve the interrupt cause field.
+ */
+ hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
+
+ /* Don't let data address watchpoint match in hypervisor state */
+ hr->dawrx0 &= ~DAWRX_HYP;
+
+ /* Don't let completed instruction address breakpt match in HV state */
+ if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+ hr->ciabr &= ~CIABR_PRIV;
+}
+
+static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ vc->pcr = hr->pcr;
+ vc->dpdes = hr->dpdes;
+ vcpu->arch.hfscr = hr->hfscr;
+ vcpu->arch.dawr = hr->dawr0;
+ vcpu->arch.dawrx = hr->dawrx0;
+ vcpu->arch.ciabr = hr->ciabr;
+ vcpu->arch.purr = hr->purr;
+ vcpu->arch.spurr = hr->spurr;
+ vcpu->arch.ic = hr->ic;
+ vc->vtb = hr->vtb;
+ vcpu->arch.shregs.srr0 = hr->srr0;
+ vcpu->arch.shregs.srr1 = hr->srr1;
+ vcpu->arch.shregs.sprg0 = hr->sprg[0];
+ vcpu->arch.shregs.sprg1 = hr->sprg[1];
+ vcpu->arch.shregs.sprg2 = hr->sprg[2];
+ vcpu->arch.shregs.sprg3 = hr->sprg[3];
+ vcpu->arch.pid = hr->pidr;
+ vcpu->arch.cfar = hr->cfar;
+ vcpu->arch.ppr = hr->ppr;
+}
+
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+ struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ vc->dpdes = hr->dpdes;
+ vcpu->arch.hfscr = hr->hfscr;
+ vcpu->arch.purr = hr->purr;
+ vcpu->arch.spurr = hr->spurr;
+ vcpu->arch.ic = hr->ic;
+ vc->vtb = hr->vtb;
+ vcpu->arch.fault_dar = hr->hdar;
+ vcpu->arch.fault_dsisr = hr->hdsisr;
+ vcpu->arch.fault_gpa = hr->asdr;
+ vcpu->arch.emul_inst = hr->heir;
+ vcpu->arch.shregs.srr0 = hr->srr0;
+ vcpu->arch.shregs.srr1 = hr->srr1;
+ vcpu->arch.shregs.sprg0 = hr->sprg[0];
+ vcpu->arch.shregs.sprg1 = hr->sprg[1];
+ vcpu->arch.shregs.sprg2 = hr->sprg[2];
+ vcpu->arch.shregs.sprg3 = hr->sprg[3];
+ vcpu->arch.pid = hr->pidr;
+ vcpu->arch.cfar = hr->cfar;
+ vcpu->arch.ppr = hr->ppr;
+}
+
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
+{
+ long int err, r;
+ struct kvm_nested_guest *l2;
+ struct pt_regs l2_regs, saved_l1_regs;
+ struct hv_guest_state l2_hv, saved_l1_hv;
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ u64 hv_ptr, regs_ptr;
+ u64 hdec_exp;
+ s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
+ u64 mask;
+ unsigned long lpcr;
+
+ if (vcpu->kvm->arch.l1_ptcr == 0)
+ return H_NOT_AVAILABLE;
+
+ /* copy parameters in */
+ hv_ptr = kvmppc_get_gpr(vcpu, 4);
+ err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
+ sizeof(struct hv_guest_state));
+ if (err)
+ return H_PARAMETER;
+ if (kvmppc_need_byteswap(vcpu))
+ byteswap_hv_regs(&l2_hv);
+ if (l2_hv.version != HV_GUEST_STATE_VERSION)
+ return H_P2;
+
+ regs_ptr = kvmppc_get_gpr(vcpu, 5);
+ err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
+ sizeof(struct pt_regs));
+ if (err)
+ return H_PARAMETER;
+ if (kvmppc_need_byteswap(vcpu))
+ byteswap_pt_regs(&l2_regs);
+ if (l2_hv.vcpu_token >= NR_CPUS)
+ return H_PARAMETER;
+
+ /* translate lpid */
+ l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
+ if (!l2)
+ return H_PARAMETER;
+ if (!l2->l1_gr_to_hr) {
+ mutex_lock(&l2->tlb_lock);
+ kvmhv_update_ptbl_cache(l2);
+ mutex_unlock(&l2->tlb_lock);
+ }
+
+ /* save l1 values of things */
+ vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+ saved_l1_regs = vcpu->arch.regs;
+ kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
+
+ /* convert TB values/offsets to host (L0) values */
+ hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
+ vc->tb_offset += l2_hv.tb_offset;
+
+ /* set L1 state to L2 state */
+ vcpu->arch.nested = l2;
+ vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+ vcpu->arch.regs = l2_regs;
+ vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
+ LPCR_LPES | LPCR_MER;
+ lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
+ sanitise_hv_regs(vcpu, &l2_hv);
+ restore_hv_regs(vcpu, &l2_hv);
+
+ vcpu->arch.ret = RESUME_GUEST;
+ vcpu->arch.trap = 0;
+ do {
+ if (mftb() >= hdec_exp) {
+ vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+ r = RESUME_HOST;
+ break;
+ }
+ r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
+ lpcr);
+ } while (is_kvmppc_resume_guest(r));
+
+ /* save L2 state for return */
+ l2_regs = vcpu->arch.regs;
+ l2_regs.msr = vcpu->arch.shregs.msr;
+ delta_purr = vcpu->arch.purr - l2_hv.purr;
+ delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
+ delta_ic = vcpu->arch.ic - l2_hv.ic;
+ delta_vtb = vc->vtb - l2_hv.vtb;
+ save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+
+ /* restore L1 state */
+ vcpu->arch.nested = NULL;
+ vcpu->arch.regs = saved_l1_regs;
+ vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
+ /* set L1 MSR TS field according to L2 transaction state */
+ if (l2_regs.msr & MSR_TS_MASK)
+ vcpu->arch.shregs.msr |= MSR_TS_S;
+ vc->tb_offset = saved_l1_hv.tb_offset;
+ restore_hv_regs(vcpu, &saved_l1_hv);
+ vcpu->arch.purr += delta_purr;
+ vcpu->arch.spurr += delta_spurr;
+ vcpu->arch.ic += delta_ic;
+ vc->vtb += delta_vtb;
+
+ kvmhv_put_nested(l2);
+
+ /* copy l2_hv_state and regs back to guest */
+ if (kvmppc_need_byteswap(vcpu)) {
+ byteswap_hv_regs(&l2_hv);
+ byteswap_pt_regs(&l2_regs);
+ }
+ err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
+ sizeof(struct hv_guest_state));
+ if (err)
+ return H_AUTHORITY;
+ err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
+ sizeof(struct pt_regs));
+ if (err)
+ return H_AUTHORITY;
+
+ if (r == -EINTR)
+ return H_INTERRUPT;
+
+ return vcpu->arch.trap;
+}
+
+long kvmhv_nested_init(void)
+{
+ long int ptb_order;
+ unsigned long ptcr;
+ long rc;
+
+ if (!kvmhv_on_pseries())
+ return 0;
+ if (!radix_enabled())
+ return -ENODEV;
+
+ /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+ ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+ if (ptb_order < 8)
+ ptb_order = 8;
+ pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+ GFP_KERNEL);
+ if (!pseries_partition_tb) {
+ pr_err("kvm-hv: failed to allocated nested partition table\n");
+ return -ENOMEM;
+ }
+
+ ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+ rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+ if (rc != H_SUCCESS) {
+ pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+ rc);
+ kfree(pseries_partition_tb);
+ pseries_partition_tb = NULL;
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+void kvmhv_nested_exit(void)
+{
+ /*
+ * N.B. the kvmhv_on_pseries() test is there because it enables
+ * the compiler to remove the call to plpar_hcall_norets()
+ * when CONFIG_PPC_PSERIES=n.
+ */
+ if (kvmhv_on_pseries() && pseries_partition_tb) {
+ plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+ kfree(pseries_partition_tb);
+ pseries_partition_tb = NULL;
+ }
+}
+
+static void kvmhv_flush_lpid(unsigned int lpid)
+{
+ long rc;
+
+ if (!kvmhv_on_pseries()) {
+ radix__flush_tlb_lpid(lpid);
+ return;
+ }
+
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+ lpid, TLBIEL_INVAL_SET_LPID);
+ if (rc)
+ pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+ if (!kvmhv_on_pseries()) {
+ mmu_partition_table_set_entry(lpid, dw0, dw1);
+ return;
+ }
+
+ pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+ pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+ /* L0 will do the necessary barriers */
+ kvmhv_flush_lpid(lpid);
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+ unsigned long dw0;
+
+ dw0 = PATB_HR | radix__get_tree_size() |
+ __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+ kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+ kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+ int srcu_idx;
+ long ret = H_SUCCESS;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ /*
+ * Limit the partition table to 4096 entries (because that's what
+ * hardware supports), and check the base address.
+ */
+ if ((ptcr & PRTS_MASK) > 12 - 8 ||
+ !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
+ ret = H_PARAMETER;
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ if (ret == H_SUCCESS)
+ kvm->arch.l1_ptcr = ptcr;
+ return ret;
+}
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+ int ret;
+ struct patb_entry ptbl_entry;
+ unsigned long ptbl_addr;
+ struct kvm *kvm = gp->l1_host;
+
+ ret = -EFAULT;
+ ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+ if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
+ ret = kvm_read_guest(kvm, ptbl_addr,
+ &ptbl_entry, sizeof(ptbl_entry));
+ if (ret) {
+ gp->l1_gr_to_hr = 0;
+ gp->process_table = 0;
+ } else {
+ gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+ gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+ }
+ kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+ struct kvm_nested_guest *gp;
+ long shadow_lpid;
+
+ gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+ if (!gp)
+ return NULL;
+ gp->l1_host = kvm;
+ gp->l1_lpid = lpid;
+ mutex_init(&gp->tlb_lock);
+ gp->shadow_pgtable = pgd_alloc(kvm->mm);
+ if (!gp->shadow_pgtable)
+ goto out_free;
+ shadow_lpid = kvmppc_alloc_lpid();
+ if (shadow_lpid < 0)
+ goto out_free2;
+ gp->shadow_lpid = shadow_lpid;
+
+ memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
+ return gp;
+
+ out_free2:
+ pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+ kfree(gp);
+ return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+
+ if (gp->shadow_pgtable) {
+ /*
+ * No vcpu is using this struct and no call to
+ * kvmhv_get_nested can find this struct,
+ * so we don't need to hold kvm->mmu_lock.
+ */
+ kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+ gp->shadow_lpid);
+ pgd_free(kvm->mm, gp->shadow_pgtable);
+ }
+ kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+ kvmppc_free_lpid(gp->shadow_lpid);
+ kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+ int lpid = gp->l1_lpid;
+ long ref;
+
+ spin_lock(&kvm->mmu_lock);
+ if (gp == kvm->arch.nested_guests[lpid]) {
+ kvm->arch.nested_guests[lpid] = NULL;
+ if (lpid == kvm->arch.max_nested_lpid) {
+ while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+ ;
+ kvm->arch.max_nested_lpid = lpid;
+ }
+ --gp->refcnt;
+ }
+ ref = gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+ if (ref == 0)
+ kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ * This is called with no vcpus of the guest running, when
+ * switching the guest to HPT mode or when destroying the
+ * guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+ int i;
+ struct kvm_nested_guest *gp;
+ struct kvm_nested_guest *freelist = NULL;
+ struct kvm_memory_slot *memslot;
+ int srcu_idx;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+ gp = kvm->arch.nested_guests[i];
+ if (!gp)
+ continue;
+ kvm->arch.nested_guests[i] = NULL;
+ if (--gp->refcnt == 0) {
+ gp->next = freelist;
+ freelist = gp;
+ }
+ }
+ kvm->arch.max_nested_lpid = -1;
+ spin_unlock(&kvm->mmu_lock);
+ while ((gp = freelist) != NULL) {
+ freelist = gp->next;
+ kvmhv_release_nested(gp);
+ }
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_for_each_memslot(memslot, kvm_memslots(kvm))
+ kvmhv_free_memslot_nest_rmap(memslot);
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* caller must hold gp->tlb_lock */
+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+
+ spin_lock(&kvm->mmu_lock);
+ kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
+ spin_unlock(&kvm->mmu_lock);
+ kvmhv_flush_lpid(gp->shadow_lpid);
+ kvmhv_update_ptbl_cache(gp);
+ if (gp->l1_gr_to_hr == 0)
+ kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+ bool create)
+{
+ struct kvm_nested_guest *gp, *newgp;
+
+ if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
+ l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+ return NULL;
+
+ spin_lock(&kvm->mmu_lock);
+ gp = kvm->arch.nested_guests[l1_lpid];
+ if (gp)
+ ++gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+
+ if (gp || !create)
+ return gp;
+
+ newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+ if (!newgp)
+ return NULL;
+ spin_lock(&kvm->mmu_lock);
+ if (kvm->arch.nested_guests[l1_lpid]) {
+ /* someone else beat us to it */
+ gp = kvm->arch.nested_guests[l1_lpid];
+ } else {
+ kvm->arch.nested_guests[l1_lpid] = newgp;
+ ++newgp->refcnt;
+ gp = newgp;
+ newgp = NULL;
+ if (l1_lpid > kvm->arch.max_nested_lpid)
+ kvm->arch.max_nested_lpid = l1_lpid;
+ }
+ ++gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+
+ if (newgp)
+ kvmhv_release_nested(newgp);
+
+ return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+ long ref;
+
+ spin_lock(&kvm->mmu_lock);
+ ref = --gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+ if (ref == 0)
+ kvmhv_release_nested(gp);
+}
+
+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+{
+ if (lpid > kvm->arch.max_nested_lpid)
+ return NULL;
+ return kvm->arch.nested_guests[lpid];
+}
+
+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
+{
+ return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
+ RMAP_NESTED_GPA_MASK));
+}
+
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+ struct rmap_nested **n_rmap)
+{
+ struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+ struct rmap_nested *cursor;
+ u64 rmap, new_rmap = (*n_rmap)->rmap;
+
+ /* Are there any existing entries? */
+ if (!(*rmapp)) {
+ /* No -> use the rmap as a single entry */
+ *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
+ return;
+ }
+
+ /* Do any entries match what we're trying to insert? */
+ for_each_nest_rmap_safe(cursor, entry, &rmap) {
+ if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
+ return;
+ }
+
+ /* Do we need to create a list or just add the new entry? */
+ rmap = *rmapp;
+ if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+ *rmapp = 0UL;
+ llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
+ if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+ (*n_rmap)->list.next = (struct llist_node *) rmap;
+
+ /* Set NULL so not freed by caller */
+ *n_rmap = NULL;
+}
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
+ unsigned long hpa, unsigned long mask)
+{
+ struct kvm_nested_guest *gp;
+ unsigned long gpa;
+ unsigned int shift, lpid;
+ pte_t *ptep;
+
+ gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+ lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+ gp = kvmhv_find_nested(kvm, lpid);
+ if (!gp)
+ return;
+
+ /* Find and invalidate the pte */
+ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ /* Don't spuriously invalidate ptes if the pfn has changed */
+ if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+ kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long hpa, unsigned long mask)
+{
+ struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
+ struct rmap_nested *cursor;
+ unsigned long rmap;
+
+ for_each_nest_rmap_safe(cursor, entry, &rmap) {
+ kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
+ kfree(cursor);
+ }
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ unsigned long gpa, unsigned long hpa,
+ unsigned long nbytes)
+{
+ unsigned long gfn, end_gfn;
+ unsigned long addr_mask;
+
+ if (!memslot)
+ return;
+ gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+ end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+ addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+ hpa &= addr_mask;
+
+ for (; gfn < end_gfn; gfn++) {
+ unsigned long *rmap = &memslot->arch.rmap[gfn];
+ kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+ }
+}
+
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+ unsigned long page;
+
+ for (page = 0; page < free->npages; page++) {
+ unsigned long rmap, *rmapp = &free->arch.rmap[page];
+ struct rmap_nested *cursor;
+ struct llist_node *entry;
+
+ entry = llist_del_all((struct llist_head *) rmapp);
+ for_each_nest_rmap_safe(cursor, entry, &rmap)
+ kfree(cursor);
+ }
+}
+
+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp,
+ long gpa, int *shift_ret)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool ret = false;
+ pte_t *ptep;
+ int shift;
+
+ spin_lock(&kvm->mmu_lock);
+ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ if (!shift)
+ shift = PAGE_SHIFT;
+ if (ptep && pte_present(*ptep)) {
+ kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+ ret = true;
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ if (shift_ret)
+ *shift_ret = shift;
+ return ret;
+}
+
+static inline int get_ric(unsigned int instr)
+{
+ return (instr >> 18) & 0x3;
+}
+
+static inline int get_prs(unsigned int instr)
+{
+ return (instr >> 17) & 0x1;
+}
+
+static inline int get_r(unsigned int instr)
+{
+ return (instr >> 16) & 0x1;
+}
+
+static inline int get_lpid(unsigned long r_val)
+{
+ return r_val & 0xffffffff;
+}
+
+static inline int get_is(unsigned long r_val)
+{
+ return (r_val >> 10) & 0x3;
+}
+
+static inline int get_ap(unsigned long r_val)
+{
+ return (r_val >> 5) & 0x7;
+}
+
+static inline long get_epn(unsigned long r_val)
+{
+ return r_val >> 12;
+}
+
+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
+ int ap, long epn)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+ long npages;
+ int shift, shadow_shift;
+ unsigned long addr;
+
+ shift = ap_to_shift(ap);
+ addr = epn << 12;
+ if (shift < 0)
+ /* Invalid ap encoding */
+ return -EINVAL;
+
+ addr &= ~((1UL << shift) - 1);
+ npages = 1UL << (shift - PAGE_SHIFT);
+
+ gp = kvmhv_get_nested(kvm, lpid, false);
+ if (!gp) /* No such guest -> nothing to do */
+ return 0;
+ mutex_lock(&gp->tlb_lock);
+
+ /* There may be more than one host page backing this single guest pte */
+ do {
+ kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
+
+ npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+ addr += 1UL << shadow_shift;
+ } while (npages > 0);
+
+ mutex_unlock(&gp->tlb_lock);
+ kvmhv_put_nested(gp);
+ return 0;
+}
+
+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp, int ric)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&gp->tlb_lock);
+ switch (ric) {
+ case 0:
+ /* Invalidate TLB */
+ spin_lock(&kvm->mmu_lock);
+ kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+ gp->shadow_lpid);
+ kvmhv_flush_lpid(gp->shadow_lpid);
+ spin_unlock(&kvm->mmu_lock);
+ break;
+ case 1:
+ /*
+ * Invalidate PWC
+ * We don't cache this -> nothing to do
+ */
+ break;
+ case 2:
+ /* Invalidate TLB, PWC and caching of partition table entries */
+ kvmhv_flush_nested(gp);
+ break;
+ default:
+ break;
+ }
+ mutex_unlock(&gp->tlb_lock);
+}
+
+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+ int i;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+ gp = kvm->arch.nested_guests[i];
+ if (gp) {
+ spin_unlock(&kvm->mmu_lock);
+ kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+ spin_lock(&kvm->mmu_lock);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
+ unsigned long rsval, unsigned long rbval)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+ int r, ric, prs, is, ap;
+ int lpid;
+ long epn;
+ int ret = 0;
+
+ ric = get_ric(instr);
+ prs = get_prs(instr);
+ r = get_r(instr);
+ lpid = get_lpid(rsval);
+ is = get_is(rbval);
+
+ /*
+ * These cases are invalid and are not handled:
+ * r != 1 -> Only radix supported
+ * prs == 1 -> Not HV privileged
+ * ric == 3 -> No cluster bombs for radix
+ * is == 1 -> Partition scoped translations not associated with pid
+ * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+ */
+ if ((!r) || (prs) || (ric == 3) || (is == 1) ||
+ ((!is) && (ric == 1 || ric == 2)))
+ return -EINVAL;
+
+ switch (is) {
+ case 0:
+ /*
+ * We know ric == 0
+ * Invalidate TLB for a given target address
+ */
+ epn = get_epn(rbval);
+ ap = get_ap(rbval);
+ ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+ break;
+ case 2:
+ /* Invalidate matching LPID */
+ gp = kvmhv_get_nested(kvm, lpid, false);
+ if (gp) {
+ kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+ kvmhv_put_nested(gp);
+ }
+ break;
+ case 3:
+ /* Invalidate ALL LPIDs */
+ kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This handles the H_TLB_INVALIDATE hcall.
+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
+ * (r6) rB contents.
+ */
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
+ if (ret)
+ return H_PARAMETER;
+ return H_SUCCESS;
+}
+
+/* Used to convert a nested guest real address to a L1 guest real address */
+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp,
+ unsigned long n_gpa, unsigned long dsisr,
+ struct kvmppc_pte *gpte_p)
+{
+ u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
+ int ret;
+
+ ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
+ &fault_addr);
+
+ if (ret) {
+ /* We didn't find a pte */
+ if (ret == -EINVAL) {
+ /* Unsupported mmu config */
+ flags |= DSISR_UNSUPP_MMU;
+ } else if (ret == -ENOENT) {
+ /* No translation found */
+ flags |= DSISR_NOHPTE;
+ } else if (ret == -EFAULT) {
+ /* Couldn't access L1 real address */
+ flags |= DSISR_PRTABLE_FAULT;
+ vcpu->arch.fault_gpa = fault_addr;
+ } else {
+ /* Unknown error */
+ return ret;
+ }
+ goto forward_to_l1;
+ } else {
+ /* We found a pte -> check permissions */
+ if (dsisr & DSISR_ISSTORE) {
+ /* Can we write? */
+ if (!gpte_p->may_write) {
+ flags |= DSISR_PROTFAULT;
+ goto forward_to_l1;
+ }
+ } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+ /* Can we execute? */
+ if (!gpte_p->may_execute) {
+ flags |= SRR1_ISI_N_OR_G;
+ goto forward_to_l1;
+ }
+ } else {
+ /* Can we read? */
+ if (!gpte_p->may_read && !gpte_p->may_write) {
+ flags |= DSISR_PROTFAULT;
+ goto forward_to_l1;
+ }
+ }
+ }
+
+ return 0;
+
+forward_to_l1:
+ vcpu->arch.fault_dsisr = flags;
+ if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+ vcpu->arch.shregs.msr &= ~0x783f0000ul;
+ vcpu->arch.shregs.msr |= flags;
+ }
+ return RESUME_HOST;
+}
+
+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp,
+ unsigned long n_gpa,
+ struct kvmppc_pte gpte,
+ unsigned long dsisr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool writing = !!(dsisr & DSISR_ISSTORE);
+ u64 pgflags;
+ bool ret;
+
+ /* Are the rc bits set in the L1 partition scoped pte? */
+ pgflags = _PAGE_ACCESSED;
+ if (writing)
+ pgflags |= _PAGE_DIRTY;
+ if (pgflags & ~gpte.rc)
+ return RESUME_HOST;
+
+ spin_lock(&kvm->mmu_lock);
+ /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
+ ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
+ gpte.raddr, kvm->arch.lpid);
+ spin_unlock(&kvm->mmu_lock);
+ if (!ret)
+ return -EINVAL;
+
+ /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */
+ ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
+ gp->shadow_lpid);
+ if (!ret)
+ return -EINVAL;
+ return 0;
+}
+
+static inline int kvmppc_radix_level_to_shift(int level)
+{
+ switch (level) {
+ case 2:
+ return PUD_SHIFT;
+ case 1:
+ return PMD_SHIFT;
+ default:
+ return PAGE_SHIFT;
+ }
+}
+
+static inline int kvmppc_radix_shift_to_level(int shift)
+{
+ if (shift == PUD_SHIFT)
+ return 2;
+ if (shift == PMD_SHIFT)
+ return 1;
+ if (shift == PAGE_SHIFT)
+ return 0;
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+/* called with gp->tlb_lock held */
+static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm