From 6fe1010d6d9c02cf3556ab076585104551a6ee7e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 10:58:56 -0700 Subject: vfio/type1: DMA unmap chunking When unmapping DMA entries we try to rely on the IOMMU API behavior that allows the IOMMU to unmap a larger area than requested, up to the size of the original mapping. This works great when the IOMMU supports superpages *and* they're in use. Otherwise, each PAGE_SIZE increment is unmapped separately, resulting in poor performance. Instead we can use the IOVA-to-physical-address translation provided by the IOMMU API and unmap using the largest contiguous physical memory chunk available, which is also how vfio/type1 would have mapped the region. For a synthetic 1TB guest VM mapping and shutdown test on Intel VT-d (2M IOMMU pagesize support), this achieves about a 30% overall improvement mapping standard 4K pages, regardless of IOMMU superpage enabling, and about a 40% improvement mapping 2M hugetlbfs pages when IOMMU superpages are not available. Hugetlbfs with IOMMU superpages enabled is effectively unchanged. Unfortunately the same algorithm does not work well on IOMMUs with fine-grained superpages, like AMD-Vi, costing about 25% extra since the IOMMU will automatically unmap any power-of-two contiguous mapping we've provided it. We add a routine and a domain flag to detect this feature, leaving AMD-Vi unaffected by this unmap optimization. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 54 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 4a9d666f1e91..e6e7f155bdd9 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -66,6 +66,7 @@ struct vfio_domain { struct list_head next; struct list_head group_list; int prot; /* IOMMU_CACHE */ + bool fgsp; /* Fine-grained super pages */ }; struct vfio_dma { @@ -350,8 +351,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) iommu_unmap(d->domain, dma->iova, dma->size); while (iova < end) { - size_t unmapped; - phys_addr_t phys; + size_t unmapped, len; + phys_addr_t phys, next; phys = iommu_iova_to_phys(domain->domain, iova); if (WARN_ON(!phys)) { @@ -359,7 +360,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) continue; } - unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE); + /* + * To optimize for fewer iommu_unmap() calls, each of which + * may require hardware cache flushing, try to find the + * largest contiguous physical memory chunk to unmap. + */ + for (len = PAGE_SIZE; + !domain->fgsp && iova + len < end; len += PAGE_SIZE) { + next = iommu_iova_to_phys(domain->domain, iova + len); + if (next != phys + len) + break; + } + + unmapped = iommu_unmap(domain->domain, iova, len); if (WARN_ON(!unmapped)) break; @@ -665,6 +678,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, return 0; } +/* + * We change our unmap behavior slightly depending on whether the IOMMU + * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage + * for practically any contiguous power-of-two mapping we give it. This means + * we don't need to look for contiguous chunks ourselves to make unmapping + * more efficient. 
On IOMMUs with coarse-grained super pages, like Intel VT-d + * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks + * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when + * hugetlbfs is in use. + */ +static void vfio_test_domain_fgsp(struct vfio_domain *domain) +{ + struct page *pages; + int ret, order = get_order(PAGE_SIZE * 2); + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!pages) + return; + + ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, + IOMMU_READ | IOMMU_WRITE | domain->prot); + if (!ret) { + size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); + + if (unmapped == PAGE_SIZE) + iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE); + else + domain->fgsp = true; + } + + __free_pages(pages, order); +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -758,6 +804,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } } + vfio_test_domain_fgsp(domain); + /* replay mappings on new domains */ ret = vfio_iommu_replay(iommu, domain); if (ret) -- cgit v1.2.3 From babbf1760970f141eb4021288ce0fb7196bc1a23 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 10:59:16 -0700 Subject: vfio/type1: Chunk contiguous reserved/invalid page mappings We currently map invalid and reserved pages, such as those that often result from mapping MMIO regions of a VM through the IOMMU, using single pages. There's really no reason we can't instead follow the methodology we use for normal pages and find the largest possible physically contiguous chunk for mapping. The only difference is that we don't do locked memory accounting for these since they're not backed by RAM. In most applications this will be a very minor improvement, but when graphics and GPGPU devices are in play, MMIO BARs become non-trivial.
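For context, both of these type1 changes sit under the same ioctl path a user such as QEMU drives for every region of guest memory: VFIO_IOMMU_MAP_DMA to pin and map, VFIO_IOMMU_UNMAP_DMA to tear it down. Below is a minimal, illustrative userspace sketch of that path, assuming a container fd that has already had VFIO_SET_IOMMU applied with VFIO_TYPE1_IOMMU; the buffer, size, and IOVA are placeholders.

#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative only: exercise the map/unmap paths these patches optimize. */
static int map_and_unmap(int container_fd, void *buf, size_t size, uint64_t iova)
{
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = (uint64_t)(uintptr_t)buf,
                .iova  = iova,
                .size  = size,
        };
        struct vfio_iommu_type1_dma_unmap unmap = {
                .argsz = sizeof(unmap),
                .iova  = iova,
                .size  = size,
        };

        /* Kernel side: vfio_pin_pages() followed by iommu_map() */
        if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
                return -1;

        /* Kernel side: vfio_unmap_unpin(), the loop modified above */
        return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
}

With the contiguity walk above, a single unmap over a large contiguous region collapses into far fewer iommu_unmap() calls, which is where the improvements quoted in the commit message come from.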
Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e6e7f155bdd9..35c90089478e 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -265,6 +265,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; bool lock_cap = capable(CAP_IPC_LOCK); long ret, i; + bool rsvd; if (!current->mm) return -ENODEV; @@ -273,10 +274,9 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, if (ret) return ret; - if (is_invalid_reserved_pfn(*pfn_base)) - return 1; + rsvd = is_invalid_reserved_pfn(*pfn_base); - if (!lock_cap && current->mm->locked_vm + 1 > limit) { + if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { put_pfn(*pfn_base, prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); @@ -284,7 +284,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, } if (unlikely(disable_hugepages)) { - vfio_lock_acct(1); + if (!rsvd) + vfio_lock_acct(1); return 1; } @@ -296,12 +297,14 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, if (ret) break; - if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + if (pfn != *pfn_base + i || + rsvd != is_invalid_reserved_pfn(pfn)) { put_pfn(pfn, prot); break; } - if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { + if (!rsvd && !lock_cap && + current->mm->locked_vm + i + 1 > limit) { put_pfn(pfn, prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); @@ -309,7 +312,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, } } - vfio_lock_acct(i); + if (!rsvd) + vfio_lock_acct(i); return i; } -- cgit v1.2.3 From c5e6688752c25434d71920bc969f9fab60353c5e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 14:19:12 -0700 Subject: vfio/type1: Add conditional rescheduling IOMMU operations can be expensive and it's not very difficult for a user to give us a lot of work to do for a map or unmap operation. Killing a large VM with vfio assigned devices can result in soft lockups and IOMMU tracing shows that we can easily spend 80% of our time with need-resched set. A sprinkling of cond_resched() calls after map and unmap calls has a very tiny effect on performance while resulting in traces with <1% of calls overflowing into need-resched.
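The change itself is the standard voluntary-preemption pattern: after each potentially expensive IOMMU call inside a long loop, give the scheduler a chance to run. A stripped-down, hypothetical sketch of the loop shape (not the actual vfio code, which follows in the diff):

#include <linux/iommu.h>
#include <linux/sched.h>

/* Hypothetical helper illustrating the pattern this patch adds. */
static void unmap_range(struct iommu_domain *domain, unsigned long iova,
                        unsigned long end, size_t step)
{
        while (iova < end) {
                size_t unmapped = iommu_unmap(domain, iova, step);

                if (!unmapped)
                        break;
                iova += unmapped;

                /* Nearly free unless need-resched is already set. */
                cond_resched();
        }
}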
Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 35c90089478e..57d8c37a002b 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -351,8 +351,10 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) domain = d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); - list_for_each_entry_continue(d, &iommu->domain_list, next) + list_for_each_entry_continue(d, &iommu->domain_list, next) { iommu_unmap(d->domain, dma->iova, dma->size); + cond_resched(); + } while (iova < end) { size_t unmapped, len; @@ -384,6 +386,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) unmapped >> PAGE_SHIFT, dma->prot, false); iova += unmapped; + + cond_resched(); } vfio_lock_acct(-unlocked); @@ -528,6 +532,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, map_try_harder(d, iova, pfn, npage, prot)) goto unwind; } + + cond_resched(); } return 0; -- cgit v1.2.3 From 60720a0fc6469e8f924f85510c2a24ecc7bdaf9c Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:06 -0700 Subject: vfio: Add device tracking during unbind There's a small window between the vfio bus driver calling vfio_del_group_dev() and the device being completely unbound where the vfio group appears to be non-viable. This creates a race for users like QEMU/KVM where the kvm-vfio module tries to get an external reference to the group in order to match and release an existing reference, while the device is potentially being removed from the vfio bus driver. If the group is momentarily non-viable, kvm-vfio may not be able to release the group reference until VM shutdown, making the group unusable until that point. Bridge the gap between device removal from the group and completion of the driver unbind by tracking it in a list. The device is added to the list before the bus driver reference is released and removed using the existing unbind notifier. 
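The window described here shows up to userspace as a transiently non-viable group. As a rough illustration, this is how a user can observe viability through the existing UAPI (group_fd is assumed to be an open /dev/vfio/<group> descriptor); the same viability notion underlies the external-user interface that kvm-vfio depends on:

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative only: check group viability from userspace. */
static int group_is_viable(int group_fd)
{
        struct vfio_group_status status = { .argsz = sizeof(status) };

        if (ioctl(group_fd, VFIO_GROUP_GET_STATUS, &status))
                return 0;

        return !!(status.flags & VFIO_GROUP_FLAGS_VIABLE);
}

Without the unbound-list tracking added below, this can momentarily report a non-viable group while a device is mid-unbind, even though no foreign driver ever attached to it.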
Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index f018d8d0f975..43d5622b19b7 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -63,6 +63,11 @@ struct vfio_container { void *iommu_data; }; +struct vfio_unbound_dev { + struct device *dev; + struct list_head unbound_next; +}; + struct vfio_group { struct kref kref; int minor; @@ -75,6 +80,8 @@ struct vfio_group { struct notifier_block nb; struct list_head vfio_next; struct list_head container_next; + struct list_head unbound_list; + struct mutex unbound_lock; atomic_t opened; }; @@ -204,6 +211,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) kref_init(&group->kref); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); + INIT_LIST_HEAD(&group->unbound_list); + mutex_init(&group->unbound_lock); atomic_set(&group->container_users, 0); atomic_set(&group->opened, 0); group->iommu_group = iommu_group; @@ -264,9 +273,16 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) static void vfio_group_release(struct kref *kref) { struct vfio_group *group = container_of(kref, struct vfio_group, kref); + struct vfio_unbound_dev *unbound, *tmp; WARN_ON(!list_empty(&group->device_list)); + list_for_each_entry_safe(unbound, tmp, + &group->unbound_list, unbound_next) { + list_del(&unbound->unbound_next); + kfree(unbound); + } + device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor)); list_del(&group->vfio_next); vfio_free_group_minor(group->minor); @@ -440,17 +456,36 @@ static bool vfio_whitelisted_driver(struct device_driver *drv) } /* - * A vfio group is viable for use by userspace if all devices are either - * driver-less or bound to a vfio or whitelisted driver. We test the - * latter by the existence of a struct vfio_device matching the dev. + * A vfio group is viable for use by userspace if all devices are in + * one of the following states: + * - driver-less + * - bound to a vfio driver + * - bound to a whitelisted driver + * + * We use two methods to determine whether a device is bound to a vfio + * driver. The first is to test whether the device exists in the vfio + * group. The second is to test if the device exists on the group + * unbound_list, indicating it's in the middle of transitioning from + * a vfio driver to driver-less. 
*/ static int vfio_dev_viable(struct device *dev, void *data) { struct vfio_group *group = data; struct vfio_device *device; struct device_driver *drv = ACCESS_ONCE(dev->driver); + struct vfio_unbound_dev *unbound; + int ret = -EINVAL; - if (!drv || vfio_whitelisted_driver(drv)) + mutex_lock(&group->unbound_lock); + list_for_each_entry(unbound, &group->unbound_list, unbound_next) { + if (dev == unbound->dev) { + ret = 0; + break; + } + } + mutex_unlock(&group->unbound_lock); + + if (!ret || !drv || vfio_whitelisted_driver(drv)) return 0; device = vfio_group_get_device(group, dev); @@ -459,7 +494,7 @@ static int vfio_dev_viable(struct device *dev, void *data) return 0; } - return -EINVAL; + return ret; } /** @@ -501,6 +536,7 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, { struct vfio_group *group = container_of(nb, struct vfio_group, nb); struct device *dev = data; + struct vfio_unbound_dev *unbound; /* * Need to go through a group_lock lookup to get a reference or we @@ -550,6 +586,17 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, * stop the system to maintain isolation. At a minimum, we'd * want a toggle to disable driver auto probe for this device. */ + + mutex_lock(&group->unbound_lock); + list_for_each_entry(unbound, + &group->unbound_list, unbound_next) { + if (dev == unbound->dev) { + list_del(&unbound->unbound_next); + kfree(unbound); + break; + } + } + mutex_unlock(&group->unbound_lock); break; } @@ -657,6 +704,7 @@ void *vfio_del_group_dev(struct device *dev) struct vfio_group *group = device->group; struct iommu_group *iommu_group = group->iommu_group; void *device_data = device->device_data; + struct vfio_unbound_dev *unbound; /* * The group exists so long as we have a device reference. Get @@ -664,6 +712,24 @@ void *vfio_del_group_dev(struct device *dev) */ vfio_group_get(group); + /* + * When the device is removed from the group, the group suddenly + * becomes non-viable; the device has a driver (until the unbind + * completes), but it's not present in the group. This is bad news + * for any external users that need to re-acquire a group reference + * in order to match and release their existing reference. To + * solve this, we track such devices on the unbound_list to bridge + * the gap until they're fully unbound. + */ + unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); + if (unbound) { + unbound->dev = dev; + mutex_lock(&group->unbound_lock); + list_add(&unbound->unbound_next, &group->unbound_list); + mutex_unlock(&group->unbound_lock); + } + WARN_ON(!unbound); + vfio_device_put(device); /* TODO send a signal to encourage this to be released */ -- cgit v1.2.3 From 4a68810dbbb4664fe4a9ac1be4d1c0e34a9b58f5 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:06 -0700 Subject: vfio: Tie IOMMU group reference to vfio group Move the iommu_group reference from the device to the vfio_group. This ensures that the iommu_group persists as long as the vfio_group remains. This can be important if all of the devices from an iommu_group are removed, but we still have an outstanding vfio_group reference; we can still walk the empty list of devices.
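The ownership rule the patch establishes is worth spelling out: the vfio_group consumes the iommu_group reference when it is created and returns it only from its final release. A simplified, hypothetical model of that pairing (shortened names, not the real vfio structures):

#include <linux/iommu.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_group {
        struct kref kref;
        struct iommu_group *iommu_group;
};

/* Creation consumes the caller's iommu_group reference. */
static struct my_group *my_group_create(struct iommu_group *iommu_group)
{
        struct my_group *group = kzalloc(sizeof(*group), GFP_KERNEL);

        if (!group)
                return NULL;

        kref_init(&group->kref);
        group->iommu_group = iommu_group;
        return group;
}

/* Called from the final kref_put(); only now is the reference dropped. */
static void my_group_release(struct kref *kref)
{
        struct my_group *group = container_of(kref, struct my_group, kref);

        iommu_group_put(group->iommu_group);
        kfree(group);
}

Holding the reference at the group level is what lets an outstanding vfio_group user keep walking an iommu_group even after all of its devices have been removed.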
Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 43d5622b19b7..13e0f39d91e0 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -274,6 +274,7 @@ static void vfio_group_release(struct kref *kref) { struct vfio_group *group = container_of(kref, struct vfio_group, kref); struct vfio_unbound_dev *unbound, *tmp; + struct iommu_group *iommu_group = group->iommu_group; WARN_ON(!list_empty(&group->device_list)); @@ -287,6 +288,7 @@ static void vfio_group_release(struct kref *kref) list_del(&group->vfio_next); vfio_free_group_minor(group->minor); vfio_group_unlock_and_free(group); + iommu_group_put(iommu_group); } static void vfio_group_put(struct vfio_group *group) @@ -625,6 +627,12 @@ int vfio_add_group_dev(struct device *dev, iommu_group_put(iommu_group); return PTR_ERR(group); } + } else { + /* + * A found vfio_group already holds a reference to the + * iommu_group. A created vfio_group keeps the reference. + */ + iommu_group_put(iommu_group); } device = vfio_group_get_device(group, dev); @@ -633,21 +641,19 @@ int vfio_add_group_dev(struct device *dev, dev_name(dev), iommu_group_id(iommu_group)); vfio_device_put(device); vfio_group_put(group); - iommu_group_put(iommu_group); return -EBUSY; } device = vfio_group_create_device(group, dev, ops, device_data); if (IS_ERR(device)) { vfio_group_put(group); - iommu_group_put(iommu_group); return PTR_ERR(device); } /* - * Added device holds reference to iommu_group and vfio_device - * (which in turn holds reference to vfio_group). Drop extra - * group reference used while acquiring device. + * Drop all but the vfio_device reference. The vfio_device holds + * a reference to the vfio_group, which holds a reference to the + * iommu_group. */ vfio_group_put(group); @@ -702,7 +708,6 @@ void *vfio_del_group_dev(struct device *dev) { struct vfio_device *device = dev_get_drvdata(dev); struct vfio_group *group = device->group; - struct iommu_group *iommu_group = group->iommu_group; void *device_data = device->device_data; struct vfio_unbound_dev *unbound; @@ -737,8 +742,6 @@ void *vfio_del_group_dev(struct device *dev) vfio_group_put(group); - iommu_group_put(iommu_group); - return device_data; } EXPORT_SYMBOL_GPL(vfio_del_group_dev); -- cgit v1.2.3 From 13060b64b819c194909121b90b5f8dd9abb5ea4e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:07 -0700 Subject: vfio: Add and use device request op for vfio bus drivers When a request is made to unbind a device from a vfio bus driver, we need to wait for the device to become unused, ie. for userspace to release the device. However, we have a long standing TODO in the code to do something proactive to make that happen. To enable this, we add a request callback on the vfio bus driver struct, which is intended to signal the user through the vfio device interface to release the device. Instead of passively waiting for the device to become unused, we can now pester the user to give it up. 
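vfio-pci implements this callback later in the series; the sketch below is a hypothetical bus driver showing the minimal shape of a .request handler, which typically just forwards the nag to an eventfd its user registered. All names here are illustrative.

#include <linux/eventfd.h>
#include <linux/mutex.h>
#include <linux/vfio.h>

struct my_vfio_device {
        struct mutex lock;
        struct eventfd_ctx *req_trigger;        /* registered by the user */
};

/* Called repeatedly by vfio_del_group_dev() with an escalating count. */
static void my_vfio_request(void *device_data, unsigned int count)
{
        struct my_vfio_device *mdev = device_data;

        mutex_lock(&mdev->lock);
        if (mdev->req_trigger)
                eventfd_signal(mdev->req_trigger, 1);
        mutex_unlock(&mdev->lock);
}

static const struct vfio_device_ops my_vfio_ops = {
        .name    = "my-vfio-driver",
        .request = my_vfio_request,
        /* .open/.release/.ioctl/.read/.write/.mmap omitted for brevity */
};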
Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 13e0f39d91e0..4cde85501444 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -710,6 +710,7 @@ void *vfio_del_group_dev(struct device *dev) struct vfio_group *group = device->group; void *device_data = device->device_data; struct vfio_unbound_dev *unbound; + unsigned int i = 0; /* * The group exists so long as we have a device reference. Get @@ -737,8 +738,27 @@ void *vfio_del_group_dev(struct device *dev) vfio_device_put(device); - /* TODO send a signal to encourage this to be released */ - wait_event(vfio.release_q, !vfio_dev_present(group, dev)); + /* + * If the device is still present in the group after the above + * 'put', then it is in use and we need to request it from the + * bus driver. The driver may in turn need to request the + * device from the user. We send the request on an arbitrary + * interval with counter to allow the driver to take escalating + * measures to release the device if it has the ability to do so. + */ + do { + device = vfio_group_get_device(group, dev); + if (!device) + break; + + if (device->ops->request) + device->ops->request(device_data, i++); + + vfio_device_put(device); + + } while (wait_event_interruptible_timeout(vfio.release_q, + !vfio_dev_present(group, dev), + HZ * 10) <= 0); vfio_group_put(group); -- cgit v1.2.3 From cac80d6e382f63243ee4f31eb55afe22ed423e53 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:07 -0700 Subject: vfio-pci: Generalize setup of simple eventfds We want another single vector IRQ index to support signaling of the device request to userspace. Generalize the error reporting IRQ index to avoid code duplication. 
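One behavior the generalized helper keeps, per the comment in the diff below, is the DATA_NONE/DATA_BOOL loopback test: writing the index with no eventfd data simply fires whatever trigger is already registered. A hedged userspace sketch of that self-test against the error index (device_fd assumed to be an open vfio-pci device fd):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative only: ask the kernel to fire the registered err_trigger. */
static int kick_err_trigger(int device_fd)
{
        struct vfio_irq_set irq_set = {
                .argsz = sizeof(irq_set),
                .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
                .index = VFIO_PCI_ERR_IRQ_INDEX,
                .start = 0,
                .count = 1,
        };

        return ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}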
Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 44 ++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index e8d695b3f54e..b134befbfd0e 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -763,46 +763,60 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, - unsigned index, unsigned start, - unsigned count, uint32_t flags, void *data) +static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, + uint32_t flags, void *data) { int32_t fd = *(int32_t *)data; - if ((index != VFIO_PCI_ERR_IRQ_INDEX) || - !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) + if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) return -EINVAL; /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); + if (*ctx) + eventfd_signal(*ctx, 1); return 0; } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger = *(uint8_t *)data; - if (trigger && vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); + if (trigger && *ctx) + eventfd_signal(*ctx, 1); return 0; } /* Handle SET_DATA_EVENTFD */ if (fd == -1) { - if (vdev->err_trigger) - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; return 0; } else if (fd >= 0) { struct eventfd_ctx *efdctx; efdctx = eventfd_ctx_fdget(fd); if (IS_ERR(efdctx)) return PTR_ERR(efdctx); - if (vdev->err_trigger) - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = efdctx; + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = efdctx; return 0; } else return -EINVAL; } + +static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_ERR_IRQ_INDEX) + return -EINVAL; + + /* + * We should sanitize start & count, but that wasn't caught + * originally, so this IRQ index must forever ignore them :-( + */ + + return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); +} + int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) -- cgit v1.2.3 From 6140a8f5623820cec7f56c63444b9551d8d35775 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 6 Feb 2015 15:05:08 -0700 Subject: vfio-pci: Add device request interface Userspace can opt to receive a device request notification, indicating that the device should be released. This is setup the same way as the error IRQ and also supports eventfd signaling. Future support may forcefully remove the device from the user if the request is ignored. 
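On the userspace side, opting in uses the same VFIO_DEVICE_SET_IRQS plumbing as the error index, just with the new VFIO_PCI_REQ_IRQ_INDEX. An illustrative sketch, assuming device_fd is an open vfio-pci device fd and error handling is kept minimal:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative only: register an eventfd for device-request notifications. */
static int register_req_eventfd(int device_fd)
{
        char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
        struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;
        int32_t efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;

        irq_set->argsz = sizeof(buf);
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
        irq_set->start = 0;
        irq_set->count = 1;
        memcpy(irq_set->data, &efd, sizeof(efd));

        if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
                close(efd);
                return -1;
        }

        return efd;     /* caller polls this for "please release the device" */
}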
Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 21 ++++++++++++++++++++- drivers/vfio/pci/vfio_pci_intrs.c | 16 ++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h | 1 + 3 files changed, 37 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 7cc0122a18ce..f8a186381ae8 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -239,9 +239,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { + return 1; + } return 0; } @@ -464,6 +467,7 @@ static long vfio_pci_ioctl(void *device_data, switch (info.index) { case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) @@ -828,6 +832,20 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) req_len, vma->vm_page_prot); } +static void vfio_pci_request(void *device_data, unsigned int count) +{ + struct vfio_pci_device *vdev = device_data; + + mutex_lock(&vdev->igate); + + if (vdev->req_trigger) { + dev_dbg(&vdev->pdev->dev, "Requesting device from user\n"); + eventfd_signal(vdev->req_trigger, 1); + } + + mutex_unlock(&vdev->igate); +} + static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", .open = vfio_pci_open, @@ -836,6 +854,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .read = vfio_pci_read, .write = vfio_pci_write, .mmap = vfio_pci_mmap, + .request = vfio_pci_request, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index b134befbfd0e..f88bfdf5b6a0 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -817,6 +817,16 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); } +static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data); +} + int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -858,6 +868,12 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, func = vfio_pci_set_err_trigger; break; } + case VFIO_PCI_REQ_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_req_trigger; + break; + } } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 671c17a6e6d0..c9f9b323f152 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -58,6 +58,7 @@ struct vfio_pci_device { struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; + struct eventfd_ctx *req_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -- cgit v1.2.3
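To close the loop, a sketch of how a user might service the notification registered above: when the eventfd fires, the user tears down its use of the device and closes the device fd, which is what lets the wait in vfio_del_group_dev() complete. Again purely illustrative; a real client such as a VM manager would unplug the device from the guest first.

#include <poll.h>
#include <stdint.h>
#include <unistd.h>

/* Illustrative only: react to a device-request notification. */
static void wait_for_device_request(int efd, int device_fd)
{
        struct pollfd pfd = { .fd = efd, .events = POLLIN };
        uint64_t count;

        while (poll(&pfd, 1, -1) >= 0) {
                if (!(pfd.revents & POLLIN))
                        continue;

                if (read(efd, &count, sizeof(count)) == sizeof(count)) {
                        close(device_fd);       /* release the device */
                        break;
                }
        }
}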