summaryrefslogtreecommitdiffstats
path: root/drivers/vfio
diff options
context:
space:
mode:
authorAlex Williamson <alex.williamson@redhat.com>2012-07-31 08:16:24 -0600
committerAlex Williamson <alex.williamson@redhat.com>2012-07-31 08:16:24 -0600
commit89e1f7d4c66d85f42c3d52ea3866eb10cadf6153 (patch)
tree6bea54ae5eaea48c17d309855d36d801259b64d1 /drivers/vfio
parent73fa0d10d077d9521ee2dace2307ae2c9a965336 (diff)
vfio: Add PCI device driver
Add PCI device support for VFIO. PCI devices expose regions for accessing config space, I/O port space, and MMIO areas of the device. PCI config access is virtualized in the kernel, allowing us to ensure the integrity of the system, by preventing various accesses while reducing duplicate support across various userspace drivers. I/O port supports read/write access while MMIO also supports mmap of sufficiently sized regions. Support for INTx, MSI, and MSI-X interrupts are provided using eventfds to userspace. Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Diffstat (limited to 'drivers/vfio')
-rw-r--r--drivers/vfio/Kconfig2
-rw-r--r--drivers/vfio/pci/Kconfig8
-rw-r--r--drivers/vfio/pci/Makefile4
-rw-r--r--drivers/vfio/pci/vfio_pci.c579
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c1540
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c740
-rw-r--r--drivers/vfio/pci/vfio_pci_private.h91
-rw-r--r--drivers/vfio/pci/vfio_pci_rdwr.c269
8 files changed, 3233 insertions, 0 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 128b97910b8e..7cd5dec0abd1 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -12,3 +12,5 @@ menuconfig VFIO
See Documentation/vfio.txt for more details.
If you don't know what to do here, say N.
+
+source "drivers/vfio/pci/Kconfig"
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
new file mode 100644
index 000000000000..5980758563eb
--- /dev/null
+++ b/drivers/vfio/pci/Kconfig
@@ -0,0 +1,8 @@
+config VFIO_PCI
+ tristate "VFIO support for PCI devices"
+ depends on VFIO && PCI && EVENTFD
+ help
+ Support for the PCI VFIO bus driver. This is required to make
+ use of PCI drivers using the VFIO framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
new file mode 100644
index 000000000000..131079255fd9
--- /dev/null
+++ b/drivers/vfio/pci/Makefile
@@ -0,0 +1,4 @@
+
+vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+
+obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
new file mode 100644
index 000000000000..6968b7232232
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -0,0 +1,579 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_pci_private.h"
+
+#define DRIVER_VERSION "0.2"
+#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC "VFIO PCI - User Level meta-driver"
+
+static bool nointxmask;
+module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nointxmask,
+ "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
+
+static int vfio_pci_enable(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+ u16 cmd;
+ u8 msix_pos;
+
+ vdev->reset_works = (pci_reset_function(pdev) == 0);
+ pci_save_state(pdev);
+ vdev->pci_saved_state = pci_store_saved_state(pdev);
+ if (!vdev->pci_saved_state)
+ pr_debug("%s: Couldn't store %s saved state\n",
+ __func__, dev_name(&pdev->dev));
+
+ ret = vfio_config_init(vdev);
+ if (ret)
+ goto out;
+
+ if (likely(!nointxmask))
+ vdev->pci_2_3 = pci_intx_mask_supported(pdev);
+
+ pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+ if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
+ cmd &= ~PCI_COMMAND_INTX_DISABLE;
+ pci_write_config_word(pdev, PCI_COMMAND, cmd);
+ }
+
+ msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
+ if (msix_pos) {
+ u16 flags;
+ u32 table;
+
+ pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
+ pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
+
+ vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
+ vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
+ vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
+ } else
+ vdev->msix_bar = 0xFF;
+
+ ret = pci_enable_device(pdev);
+ if (ret)
+ goto out;
+
+ return ret;
+
+out:
+ kfree(vdev->pci_saved_state);
+ vdev->pci_saved_state = NULL;
+ vfio_config_free(vdev);
+ return ret;
+}
+
+static void vfio_pci_disable(struct vfio_pci_device *vdev)
+{
+ int bar;
+
+ pci_disable_device(vdev->pdev);
+
+ vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
+ VFIO_IRQ_SET_ACTION_TRIGGER,
+ vdev->irq_type, 0, 0, NULL);
+
+ vdev->virq_disabled = false;
+
+ vfio_config_free(vdev);
+
+ pci_reset_function(vdev->pdev);
+
+ if (pci_load_and_free_saved_state(vdev->pdev,
+ &vdev->pci_saved_state) == 0)
+ pci_restore_state(vdev->pdev);
+ else
+ pr_info("%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&vdev->pdev->dev));
+
+ for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+ if (!vdev->barmap[bar])
+ continue;
+ pci_iounmap(vdev->pdev, vdev->barmap[bar]);
+ pci_release_selected_regions(vdev->pdev, 1 << bar);
+ vdev->barmap[bar] = NULL;
+ }
+}
+
+static void vfio_pci_release(void *device_data)
+{
+ struct vfio_pci_device *vdev = device_data;
+
+ if (atomic_dec_and_test(&vdev->refcnt))
+ vfio_pci_disable(vdev);
+
+ module_put(THIS_MODULE);
+}
+
+static int vfio_pci_open(void *device_data)
+{
+ struct vfio_pci_device *vdev = device_data;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ if (atomic_inc_return(&vdev->refcnt) == 1) {
+ int ret = vfio_pci_enable(vdev);
+ if (ret) {
+ module_put(THIS_MODULE);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
+{
+ if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
+ u8 pin;
+ pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
+ if (pin)
+ return 1;
+
+ } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
+ u8 pos;
+ u16 flags;
+
+ pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
+ if (pos) {
+ pci_read_config_word(vdev->pdev,
+ pos + PCI_MSI_FLAGS, &flags);
+
+ return 1 << (flags & PCI_MSI_FLAGS_QMASK);
+ }
+ } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
+ u8 pos;
+ u16 flags;
+
+ pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
+ if (pos) {
+ pci_read_config_word(vdev->pdev,
+ pos + PCI_MSIX_FLAGS, &flags);
+
+ return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
+ }
+ }
+
+ return 0;
+}
+
+static long vfio_pci_ioctl(void *device_data,
+ unsigned int cmd, unsigned long arg)
+{
+ struct vfio_pci_device *vdev = device_data;
+ unsigned long minsz;
+
+ if (cmd == VFIO_DEVICE_GET_INFO) {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ info.flags = VFIO_DEVICE_FLAGS_PCI;
+
+ if (vdev->reset_works)
+ info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+ info.num_regions = VFIO_PCI_NUM_REGIONS;
+ info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+ return copy_to_user((void __user *)arg, &info, minsz);
+
+ } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+ struct pci_dev *pdev = vdev->pdev;
+ struct vfio_region_info info;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ switch (info.index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info.size = pdev->cfg_size;
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info.size = pci_resource_len(pdev, info.index);
+ if (!info.size) {
+ info.flags = 0;
+ break;
+ }
+
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ if (pci_resource_flags(pdev, info.index) &
+ IORESOURCE_MEM && info.size >= PAGE_SIZE)
+ info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
+ break;
+ case VFIO_PCI_ROM_REGION_INDEX:
+ {
+ void __iomem *io;
+ size_t size;
+
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info.flags = 0;
+
+ /* Report the BAR size, not the ROM size */
+ info.size = pci_resource_len(pdev, info.index);
+ if (!info.size)
+ break;
+
+ /* Is it really there? */
+ io = pci_map_rom(pdev, &size);
+ if (!io || !size) {
+ info.size = 0;
+ break;
+ }
+ pci_unmap_rom(pdev, io);
+
+ info.flags = VFIO_REGION_INFO_FLAG_READ;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return copy_to_user((void __user *)arg, &info, minsz);
+
+ } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+ return -EINVAL;
+
+ info.flags = VFIO_IRQ_INFO_EVENTFD;
+
+ info.count = vfio_pci_get_irq_count(vdev, info.index);
+
+ if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+ info.flags |= (VFIO_IRQ_INFO_MASKABLE |
+ VFIO_IRQ_INFO_AUTOMASKED);
+ else
+ info.flags |= VFIO_IRQ_INFO_NORESIZE;
+
+ return copy_to_user((void __user *)arg, &info, minsz);
+
+ } else if (cmd == VFIO_DEVICE_SET_IRQS) {
+ struct vfio_irq_set hdr;
+ u8 *data = NULL;
+ int ret = 0;
+
+ minsz = offsetofend(struct vfio_irq_set, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
+ hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+ VFIO_IRQ_SET_ACTION_TYPE_MASK))
+ return -EINVAL;
+
+ if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+ size_t size;
+
+ if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
+ size = sizeof(uint8_t);
+ else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
+ size = sizeof(int32_t);
+ else
+ return -EINVAL;
+
+ if (hdr.argsz - minsz < hdr.count * size ||
+ hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
+ return -EINVAL;
+
+ data = kmalloc(hdr.count * size, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ if (copy_from_user(data, (void __user *)(arg + minsz),
+ hdr.count * size)) {
+ kfree(data);
+ return -EFAULT;
+ }
+ }
+
+ mutex_lock(&vdev->igate);
+
+ ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+ hdr.start, hdr.count, data);
+
+ mutex_unlock(&vdev->igate);
+ kfree(data);
+
+ return ret;
+
+ } else if (cmd == VFIO_DEVICE_RESET)
+ return vdev->reset_works ?
+ pci_reset_function(vdev->pdev) : -EINVAL;
+
+ return -ENOTTY;
+}
+
+static ssize_t vfio_pci_read(void *device_data, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct vfio_pci_device *vdev = device_data;
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (index >= VFIO_PCI_NUM_REGIONS)
+ return -EINVAL;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
+ else if (index == VFIO_PCI_ROM_REGION_INDEX)
+ return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
+ else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
+ return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
+ else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
+ return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
+
+ return -EINVAL;
+}
+
+static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct vfio_pci_device *vdev = device_data;
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (index >= VFIO_PCI_NUM_REGIONS)
+ return -EINVAL;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return vfio_pci_config_readwrite(vdev, (char __user *)buf,
+ count, ppos, true);
+ else if (index == VFIO_PCI_ROM_REGION_INDEX)
+ return -EINVAL;
+ else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
+ return vfio_pci_io_readwrite(vdev, (char __user *)buf,
+ count, ppos, true);
+ else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
+ return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
+ count, ppos, true);
+ }
+
+ return -EINVAL;
+}
+
+static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
+{
+ struct vfio_pci_device *vdev = device_data;
+ struct pci_dev *pdev = vdev->pdev;
+ unsigned int index;
+ u64 phys_len, req_len, pgoff, req_start, phys;
+ int ret;
+
+ index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+ if (vma->vm_end < vma->vm_start)
+ return -EINVAL;
+ if ((vma->vm_flags & VM_SHARED) == 0)
+ return -EINVAL;
+ if (index >= VFIO_PCI_ROM_REGION_INDEX)
+ return -EINVAL;
+ if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
+ return -EINVAL;
+
+ phys_len = pci_resource_len(pdev, index);
+ req_len = vma->vm_end - vma->vm_start;
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+ req_start = pgoff << PAGE_SHIFT;
+
+ if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
+ return -EINVAL;
+
+ if (index == vdev->msix_bar) {
+ /*
+ * Disallow mmaps overlapping the MSI-X table; users don't
+ * get to touch this directly. We could find somewhere
+ * else to map the overlap, but page granularity is only
+ * a recommendation, not a requirement, so the user needs
+ * to know which bits are real. Requiring them to mmap
+ * around the table makes that clear.
+ */
+
+ /* If neither entirely above nor below, then it overlaps */
+ if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
+ req_start + req_len <= vdev->msix_offset))
+ return -EINVAL;
+ }
+
+ /*
+ * Even though we don't make use of the barmap for the mmap,
+ * we need to request the region and the barmap tracks that.
+ */
+ if (!vdev->barmap[index]) {
+ ret = pci_request_selected_regions(pdev,
+ 1 << index, "vfio-pci");
+ if (ret)
+ return ret;
+
+ vdev->barmap[index] = pci_iomap(pdev, index, 0);
+ }
+
+ vma->vm_private_data = vdev;
+ vma->vm_flags |= (VM_IO | VM_RESERVED);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ phys = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+
+ return remap_pfn_range(vma, vma->vm_start, phys,
+ req_len, vma->vm_page_prot);
+}
+
+static const struct vfio_device_ops vfio_pci_ops = {
+ .name = "vfio-pci",
+ .open = vfio_pci_open,
+ .release = vfio_pci_release,
+ .ioctl = vfio_pci_ioctl,
+ .read = vfio_pci_read,
+ .write = vfio_pci_write,
+ .mmap = vfio_pci_mmap,
+};
+
+static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ u8 type;
+ struct vfio_pci_device *vdev;
+ struct iommu_group *group;
+ int ret;
+
+ pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
+ if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
+ return -EINVAL;
+
+ group = iommu_group_get(&pdev->dev);
+ if (!group)
+ return -EINVAL;
+
+ vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+ if (!vdev) {
+ iommu_group_put(group);
+ return -ENOMEM;
+ }
+
+ vdev->pdev = pdev;
+ vdev->irq_type = VFIO_PCI_NUM_IRQS;
+ mutex_init(&vdev->igate);
+ spin_lock_init(&vdev->irqlock);
+ atomic_set(&vdev->refcnt, 0);
+
+ ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+ if (ret) {
+ iommu_group_put(group);
+ kfree(vdev);
+ }
+
+ return ret;
+}
+
+static void vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct vfio_pci_device *vdev;
+
+ vdev = vfio_del_group_dev(&pdev->dev);
+ if (!vdev)
+ return;
+
+ iommu_group_put(pdev->dev.iommu_group);
+ kfree(vdev);
+}
+
+static struct pci_driver vfio_pci_driver = {
+ .name = "vfio-pci",
+ .id_table = NULL, /* only dynamic ids */
+ .probe = vfio_pci_probe,
+ .remove = vfio_pci_remove,
+};
+
+static void __exit vfio_pci_cleanup(void)
+{
+ pci_unregister_driver(&vfio_pci_driver);
+ vfio_pci_virqfd_exit();
+ vfio_pci_uninit_perm_bits();
+}
+
+static int __init vfio_pci_init(void)
+{
+ int ret;
+
+ /* Allocate shared config space permision data used by all devices */
+ ret = vfio_pci_init_perm_bits();
+ if (ret)
+ return ret;
+
+ /* Start the virqfd cleanup handler */
+ ret = vfio_pci_virqfd_init();
+ if (ret)
+ goto out_virqfd;
+
+ /* Register and scan for devices */
+ ret = pci_register_driver(&vfio_pci_driver);
+ if (ret)
+ goto out_driver;
+
+ return 0;
+
+out_virqfd:
+ vfio_pci_virqfd_exit();
+out_driver:
+ vfio_pci_uninit_perm_bits();
+ return ret;
+}
+
+module_init(vfio_pci_init);
+module_exit(vfio_pci_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
new file mode 100644
index 000000000000..8b8f7d11e102
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -0,0 +1,1540 @@
+/*
+ * VFIO PCI config space virtualization
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc. All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+/*
+ * This code handles reading and writing of PCI configuration registers.
+ * This is hairy because we want to allow a lot of flexibility to the
+ * user driver, but cannot trust it with all of the config fields.
+ * Tables determine which fields can be read and written, as well as
+ * which fields are 'virtualized' - special actions and translations to
+ * make it appear to the user that he has control, when in fact things
+ * must be negotiated with the underlying OS.
+ */
+
+#include <linux/fs.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_pci_private.h"
+
+#define PCI_CFG_SPACE_SIZE 256
+
+/* Useful "pseudo" capabilities */
+#define PCI_CAP_ID_BASIC 0
+#define PCI_CAP_ID_INVALID 0xFF
+
+#define is_bar(offset) \
+ ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \
+ (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4))
+
+/*
+ * Lengths of PCI Config Capabilities
+ * 0: Removed from the user visible capability list
+ * FF: Variable length
+ */
+static u8 pci_cap_length[] = {
+ [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */
+ [PCI_CAP_ID_PM] = PCI_PM_SIZEOF,
+ [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF,
+ [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF,
+ [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */
+ [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */
+ [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */
+ [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */
+ [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */
+ [PCI_CAP_ID_VNDR] = 0xFF, /* variable */
+ [PCI_CAP_ID_DBG] = 0, /* debug - don't care */
+ [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */
+ [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */
+ [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */
+ [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */
+ [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */
+ [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */
+ [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF,
+ [PCI_CAP_ID_SATA] = 0xFF,
+ [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF,
+};
+
+/*
+ * Lengths of PCIe/PCI-X Extended Config Capabilities
+ * 0: Removed or masked from the user visible capabilty list
+ * FF: Variable length
+ */
+static u16 pci_ext_cap_length[] = {
+ [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND,
+ [PCI_EXT_CAP_ID_VC] = 0xFF,
+ [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF,
+ [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF,
+ [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */
+ [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */
+ [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */
+ [PCI_EXT_CAP_ID_MFVC] = 0xFF,
+ [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */
+ [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */
+ [PCI_EXT_CAP_ID_VNDR] = 0xFF,
+ [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */
+ [PCI_EXT_CAP_ID_ACS] = 0xFF,
+ [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF,
+ [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF,
+ [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF,
+ [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */
+ [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF,
+ [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF,
+ [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */
+ [PCI_EXT_CAP_ID_REBAR] = 0xFF,
+ [PCI_EXT_CAP_ID_DPA] = 0xFF,
+ [PCI_EXT_CAP_ID_TPH] = 0xFF,
+ [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF,
+ [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */
+ [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */
+ [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */
+};
+
+/*
+ * Read/Write Permission Bits - one bit for each bit in capability
+ * Any field can be read if it exists, but what is read depends on
+ * whether the field is 'virtualized', or just pass thru to the
+ * hardware. Any virtualized field is also virtualized for writes.
+ * Writes are only permitted if they have a 1 bit here.
+ */
+struct perm_bits {
+ u8 *virt; /* read/write virtual data, not hw */
+ u8 *write; /* writeable bits */
+ int (*readfn)(struct vfio_pci_device *vdev, int pos, int count,
+ struct perm_bits *perm, int offset, __le32 *val);
+ int (*writefn)(struct vfio_pci_device *vdev, int pos, int count,
+ struct perm_bits *perm, int offset, __le32 val);
+};
+
+#define NO_VIRT 0
+#define ALL_VIRT 0xFFFFFFFFU
+#define NO_WRITE 0
+#define ALL_WRITE 0xFFFFFFFFU
+
+static int vfio_user_config_read(struct pci_dev *pdev, int offset,
+ __le32 *val, int count)
+{
+ int ret = -EINVAL;
+ u32 tmp_val = 0;
+
+ switch (count) {
+ case 1:
+ {
+ u8 tmp;
+ ret = pci_user_read_config_byte(pdev, offset, &tmp);
+ tmp_val = tmp;
+ break;
+ }
+ case 2:
+ {
+ u16 tmp;
+ ret = pci_user_read_config_word(pdev, offset, &tmp);
+ tmp_val = tmp;
+ break;
+ }
+ case 4:
+ ret = pci_user_read_config_dword(pdev, offset, &tmp_val);
+ break;
+ }
+
+ *val = cpu_to_le32(tmp_val);
+
+ return pcibios_err_to_errno(ret);
+}
+
+static int vfio_user_config_write(struct pci_dev *pdev, int offset,
+ __le32 val, int count)
+{
+ int ret = -EINVAL;
+ u32 tmp_val = le32_to_cpu(val);
+
+ switch (count) {
+ case 1:
+ ret = pci_user_write_config_byte(pdev, offset, tmp_val);
+ break;
+ case 2:
+ ret = pci_user_write_config_word(pdev, offset, tmp_val);
+ break;
+ case 4:
+ ret = pci_user_write_config_dword(pdev, offset, tmp_val);
+ break;
+ }
+
+ return pcibios_err_to_errno(ret);
+}
+
+static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 *val)
+{
+ __le32 virt = 0;
+
+ memcpy(val, vdev->vconfig + pos, count);
+
+ memcpy(&virt, perm->virt + offset, count);
+
+ /* Any non-virtualized bits? */
+ if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) {
+ struct pci_dev *pdev = vdev->pdev;
+ __le32 phys_val = 0;
+ int ret;
+
+ ret = vfio_user_config_read(pdev, pos, &phys_val, count);
+ if (ret)
+ return ret;
+
+ *val = (phys_val & ~virt) | (*val & virt);
+ }
+
+ return count;
+}
+
+static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 val)
+{
+ __le32 virt = 0, write = 0;
+
+ memcpy(&write, perm->write + offset, count);
+
+ if (!write)
+ return count; /* drop, no writable bits */
+
+ memcpy(&virt, perm->virt + offset, count);
+
+ /* Virtualized and writable bits go to vconfig */
+ if (write & virt) {
+ __le32 virt_val = 0;
+
+ memcpy(&virt_val, vdev->vconfig + pos, count);
+
+ virt_val &= ~(write & virt);
+ virt_val |= (val & (write & virt));
+
+ memcpy(vdev->vconfig + pos, &virt_val, count);
+ }
+
+ /* Non-virtualzed and writable bits go to hardware */
+ if (write & ~virt) {
+ struct pci_dev *pdev = vdev->pdev;
+ __le32 phys_val = 0;
+ int ret;
+
+ ret = vfio_user_config_read(pdev, pos, &phys_val, count);
+ if (ret)
+ return ret;
+
+ phys_val &= ~(write & ~virt);
+ phys_val |= (val & (write & ~virt));
+
+ ret = vfio_user_config_write(pdev, pos, phys_val, count);
+ if (ret)
+ return ret;
+ }
+
+ return count;
+}
+
+/* Allow direct read from hardware, except for capability next pointer */
+static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 *val)
+{
+ int ret;
+
+ ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+ if (ret)
+ return pcibios_err_to_errno(ret);
+
+ if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */
+ if (offset < 4)
+ memcpy(val, vdev->vconfig + pos, count);
+ } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */
+ if (offset == PCI_CAP_LIST_ID && count > 1)
+ memcpy(val, vdev->vconfig + pos,
+ min(PCI_CAP_FLAGS, count));
+ else if (offset == PCI_CAP_LIST_NEXT)
+ memcpy(val, vdev->vconfig + pos, 1);
+ }
+
+ return count;
+}
+
+static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 val)
+{
+ int ret;
+
+ ret = vfio_user_config_write(vdev->pdev, pos, val, count);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+/* Default all regions to read-only, no-virtualization */
+static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
+ [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
+};
+static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
+ [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
+};
+
+static void free_perm_bits(struct perm_bits *perm)
+{
+ kfree(perm->virt);
+ kfree(perm->write);
+ perm->virt = NULL;
+ perm->write = NULL;
+}
+
+static int alloc_perm_bits(struct perm_bits *perm, int size)
+{
+ /*
+ * Round up all permission bits to the next dword, this lets us
+ * ignore whether a read/write exceeds the defined capability
+ * structure. We can do this because:
+ * - Standard config space is already dword aligned
+ * - Capabilities are all dword alinged (bits 0:1 of next reserved)
+ * - Express capabilities defined as dword aligned
+ */
+ size = round_up(size, 4);
+
+ /*
+ * Zero state is
+ * - All Readable, None Writeable, None Virtualized
+ */
+ perm->virt = kzalloc(size, GFP_KERNEL);
+ perm->write = kzalloc(size, GFP_KERNEL);
+ if (!perm->virt || !perm->write) {
+ free_perm_bits(perm);
+ return -ENOMEM;
+ }
+
+ perm->readfn = vfio_default_config_read;
+ perm->writefn = vfio_default_config_write;
+
+ return 0;
+}
+
+/*
+ * Helper functions for filling in permission tables
+ */
+static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write)
+{
+ p->virt[off] = virt;
+ p->write[off] = write;
+}
+
+/* Handle endian-ness - pci and tables are little-endian */
+static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write)
+{
+ *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt);
+ *(__le16 *)(&p->write[off]) = cpu_to_le16(write);
+}
+
+/* Handle endian-ness - pci and tables are little-endian */
+static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write)
+{
+ *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt);
+ *(__le32 *)(&p->write[off]) = cpu_to_le32(write);
+}
+
+/*
+ * Restore the *real* BARs after we detect a FLR or backdoor reset.
+ * (backdoor = some device specific technique that we didn't catch)
+ */
+static void vfio_bar_restore(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ u32 *rbar = vdev->rbar;
+ int i;
+
+ if (pdev->is_virtfn)
+ return;
+
+ pr_info("%s: %s reset recovery - restoring bars\n",
+ __func__, dev_name(&pdev->dev));
+
+ for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
+ pci_user_write_config_dword(pdev, i, *rbar);
+
+ pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar);
+}
+
+static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar)
+{
+ unsigned long flags = pci_resource_flags(pdev, bar);
+ u32 val;
+
+ if (flags & IORESOURCE_IO)
+ return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
+
+ val = PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+ if (flags & IORESOURCE_PREFETCH)
+ val |= PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+ if (flags & IORESOURCE_MEM_64)
+ val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+
+ return cpu_to_le32(val);
+}
+
+/*
+ * Pretend we're hardware and tweak the values of the *virtual* PCI BARs
+ * to reflect the hardware capabilities. This implements BAR sizing.
+ */
+static void vfio_bar_fixup(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int i;
+ __le32 *bar;
+ u64 mask;
+
+ bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+
+ for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
+ if (!pci_resource_start(pdev, i)) {
+ *bar = 0; /* Unmapped by host = unimplemented to user */
+ continue;
+ }
+
+ mask = ~(pci_resource_len(pdev, i) - 1);
+
+ *bar &= cpu_to_le32((u32)mask);
+ *bar |= vfio_generate_bar_flags(pdev, i);
+
+ if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
+ bar++;
+ *bar &= cpu_to_le32((u32)(mask >> 32));
+ i++;
+ }
+ }
+
+ bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
+
+ /*
+ * NB. we expose the actual BAR size here, regardless of whether
+ * we can read it. When we report the REGION_INFO for the ROM
+ * we report what PCI tells us is the actual ROM size.
+ */
+ if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
+ mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
+ mask |= PCI_ROM_ADDRESS_ENABLE;
+ *bar &= cpu_to_le32((u32)mask);
+ } else
+ *bar = 0;
+
+ vdev->bardirty = false;
+}
+
+static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 *val)
+{
+ if (is_bar(offset)) /* pos == offset for basic config */
+ vfio_bar_fixup(vdev);
+
+ count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
+
+ /* Mask in virtual memory enable for SR-IOV devices */
+ if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
+ u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
+ u32 tmp_val = le32_to_cpu(*val);
+
+ tmp_val |= cmd & PCI_COMMAND_MEMORY;
+ *val = cpu_to_le32(tmp_val);
+ }
+
+ return count;
+}
+
+static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 val)
+{
+ struct pci_dev *pdev = vdev->pdev;