diff options
Diffstat (limited to 'drivers/xen')
-rw-r--r-- | drivers/xen/Kconfig | 10 | ||||
-rw-r--r-- | drivers/xen/Makefile | 6 | ||||
-rw-r--r-- | drivers/xen/balloon.c | 375 | ||||
-rw-r--r-- | drivers/xen/events.c | 120 | ||||
-rw-r--r-- | drivers/xen/gntalloc.c | 545 | ||||
-rw-r--r-- | drivers/xen/gntdev.c | 382 | ||||
-rw-r--r-- | drivers/xen/grant-table.c | 10 | ||||
-rw-r--r-- | drivers/xen/manage.c | 153 | ||||
-rw-r--r-- | drivers/xen/platform-pci.c | 3 | ||||
-rw-r--r-- | drivers/xen/xen-balloon.c | 256 |
10 files changed, 1391 insertions, 469 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 07bec09d1dad..a59638b37c1a 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -76,10 +76,20 @@ config XEN_XENBUS_FRONTEND config XEN_GNTDEV tristate "userspace grant access device driver" depends on XEN + default m select MMU_NOTIFIER help Allows userspace processes to use grants. +config XEN_GRANT_DEV_ALLOC + tristate "User-space grant reference allocator driver" + depends on XEN + default m + help + Allows userspace processes to create pages with access granted + to other domains. This can be used to implement frontend drivers + or as part of an inter-domain shared memory channel. + config XEN_PLATFORM_PCI tristate "xen platform pci device driver" depends on XEN_PVHVM && PCI diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 5088cc2e6fe2..f420f1ff7f13 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,4 @@ -obj-y += grant-table.o features.o events.o manage.o +obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -7,9 +7,10 @@ CFLAGS_features.o := $(nostackp) obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o -obj-$(CONFIG_XEN_BALLOON) += balloon.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o +obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o @@ -18,5 +19,6 @@ obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o +xen-gntalloc-y := gntalloc.o xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 43f9f02c7db0..043af8ad6b60 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -1,6 +1,4 @@ /****************************************************************************** - * balloon.c - * * Xen balloon driver - enables returning/claiming memory to/from Xen. * * Copyright (c) 2003, B Dragovic @@ -33,7 +31,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> @@ -42,13 +39,11 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/sysdev.h> #include <linux/gfp.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/e820.h> @@ -58,35 +53,29 @@ #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> -#include <xen/xenbus.h> +#include <xen/balloon.h> #include <xen/features.h> #include <xen/page.h> -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) - -#define BALLOON_CLASS_NAME "xen_memory" +/* + * balloon_process() state: + * + * BP_DONE: done or nothing to do, + * BP_EAGAIN: error, go to sleep, + * BP_ECANCELED: error, balloon operation canceled. + */ -struct balloon_stats { - /* We aim for 'current allocation' == 'target allocation'. */ - unsigned long current_pages; - unsigned long target_pages; - /* - * Drivers may alter the memory reservation independently, but they - * must inform the balloon driver so we avoid hitting the hard limit. - */ - unsigned long driver_pages; - /* Number of pages in high- and low-memory balloons. */ - unsigned long balloon_low; - unsigned long balloon_high; +enum bp_state { + BP_DONE, + BP_EAGAIN, + BP_ECANCELED }; -static DEFINE_MUTEX(balloon_mutex); - -static struct sys_device balloon_sysdev; -static int register_balloon(struct sys_device *sysdev); +static DEFINE_MUTEX(balloon_mutex); -static struct balloon_stats balloon_stats; +struct balloon_stats balloon_stats; +EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; @@ -104,8 +93,7 @@ static LIST_HEAD(ballooned_pages); /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); -static DECLARE_WORK(balloon_worker, balloon_process); -static struct timer_list balloon_timer; +static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ @@ -140,14 +128,17 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(bool prefer_highmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - page = list_entry(ballooned_pages.next, struct page, lru); + if (prefer_highmem) + page = list_entry(ballooned_pages.prev, struct page, lru); + else + page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); if (PageHighMem(page)) { @@ -177,9 +168,29 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static void balloon_alarm(unsigned long unused) +static enum bp_state update_schedule(enum bp_state state) { - schedule_work(&balloon_worker); + if (state == BP_DONE) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_DONE; + } + + ++balloon_stats.retry_count; + + if (balloon_stats.max_retry_count != RETRY_UNLIMITED && + balloon_stats.retry_count > balloon_stats.max_retry_count) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_ECANCELED; + } + + balloon_stats.schedule_delay <<= 1; + + if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) + balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; + + return BP_EAGAIN; } static unsigned long current_target(void) @@ -194,11 +205,11 @@ static unsigned long current_target(void) return target; } -static int increase_reservation(unsigned long nr_pages) +static enum bp_state increase_reservation(unsigned long nr_pages) { + int rc; unsigned long pfn, i; struct page *page; - long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, @@ -210,7 +221,10 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { - BUG_ON(page == NULL); + if (!page) { + nr_pages = i; + break; + } frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } @@ -218,11 +232,11 @@ static int increase_reservation(unsigned long nr_pages) set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < 0) - goto out; + if (rc <= 0) + return BP_EAGAIN; for (i = 0; i < rc; i++) { - page = balloon_retrieve(); + page = balloon_retrieve(false); BUG_ON(page == NULL); pfn = page_to_pfn(page); @@ -232,7 +246,7 @@ static int increase_reservation(unsigned long nr_pages) set_phys_to_machine(pfn, frame_list[i]); /* Link back into the page tables if not highmem. */ - if (pfn < max_low_pfn) { + if (!xen_hvm_domain() && pfn < max_low_pfn) { int ret; ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), @@ -249,15 +263,14 @@ static int increase_reservation(unsigned long nr_pages) balloon_stats.current_pages += rc; - out: - return rc < 0 ? rc : rc != nr_pages; + return BP_DONE; } -static int decrease_reservation(unsigned long nr_pages) +static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { + enum bp_state state = BP_DONE; unsigned long pfn, i; struct page *page; - int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -269,9 +282,9 @@ static int decrease_reservation(unsigned long nr_pages) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(GFP_BALLOON)) == NULL) { + if ((page = alloc_page(gfp)) == NULL) { nr_pages = i; - need_sleep = 1; + state = BP_EAGAIN; break; } @@ -280,7 +293,7 @@ static int decrease_reservation(unsigned long nr_pages) scrub_page(page); - if (!PageHighMem(page)) { + if (!xen_hvm_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), __pte_ma(0), 0); @@ -296,7 +309,7 @@ static int decrease_reservation(unsigned long nr_pages) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); balloon_append(pfn_to_page(pfn)); } @@ -307,7 +320,7 @@ static int decrease_reservation(unsigned long nr_pages) balloon_stats.current_pages -= nr_pages; - return need_sleep; + return state; } /* @@ -318,99 +331,125 @@ static int decrease_reservation(unsigned long nr_pages) */ static void balloon_process(struct work_struct *work) { - int need_sleep = 0; + enum bp_state state = BP_DONE; long credit; mutex_lock(&balloon_mutex); do { credit = current_target() - balloon_stats.current_pages; + if (credit > 0) - need_sleep = (increase_reservation(credit) != 0); + state = increase_reservation(credit); + if (credit < 0) - need_sleep = (decrease_reservation(-credit) != 0); + state = decrease_reservation(-credit, GFP_BALLOON); + + state = update_schedule(state); #ifndef CONFIG_PREEMPT if (need_resched()) schedule(); #endif - } while ((credit != 0) && !need_sleep); + } while (credit && state == BP_DONE); /* Schedule more work if there is some still to be done. */ - if (current_target() != balloon_stats.current_pages) - mod_timer(&balloon_timer, jiffies + HZ); + if (state == BP_EAGAIN) + schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); mutex_unlock(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ -static void balloon_set_new_target(unsigned long target) +void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_work(&balloon_worker); + schedule_delayed_work(&balloon_worker, 0); } +EXPORT_SYMBOL_GPL(balloon_set_new_target); -static struct xenbus_watch target_watch = -{ - .node = "memory/target" -}; - -/* React to a change in the target key */ -static void watch_target(struct xenbus_watch *watch, - const char **vec, unsigned int len) +/** + * alloc_xenballooned_pages - get pages that have been ballooned out + * @nr_pages: Number of pages to get + * @pages: pages returned + * @return 0 on success, error otherwise + */ +int alloc_xenballooned_pages(int nr_pages, struct page** pages) { - unsigned long long new_target; - int err; - - err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); - if (err != 1) { - /* This is ok (for domain0 at least) - so just return */ - return; + int pgno = 0; + struct page* page; + mutex_lock(&balloon_mutex); + while (pgno < nr_pages) { + page = balloon_retrieve(true); + if (page) { + pages[pgno++] = page; + } else { + enum bp_state st; + st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (st != BP_DONE) + goto out_undo; + } } - - /* The given memory/target value is in KiB, so it needs converting to - * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. - */ - balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); + mutex_unlock(&balloon_mutex); + return 0; + out_undo: + while (pgno) + balloon_append(pages[--pgno]); + /* Free the memory back to the kernel soon */ + schedule_delayed_work(&balloon_worker, 0); + mutex_unlock(&balloon_mutex); + return -ENOMEM; } +EXPORT_SYMBOL(alloc_xenballooned_pages); -static int balloon_init_watcher(struct notifier_block *notifier, - unsigned long event, - void *data) +/** + * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void free_xenballooned_pages(int nr_pages, struct page** pages) { - int err; + int i; - err = register_xenbus_watch(&target_watch); - if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + mutex_lock(&balloon_mutex); - return NOTIFY_DONE; -} + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + balloon_append(pages[i]); + } + + /* The balloon may be too large now. Shrink it if needed. */ + if (current_target() != balloon_stats.current_pages) + schedule_delayed_work(&balloon_worker, 0); -static struct notifier_block xenstore_notifier; + mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL(free_xenballooned_pages); static int __init balloon_init(void) { - unsigned long pfn, extra_pfn_end; + unsigned long pfn, nr_pages, extra_pfn_end; struct page *page; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; - pr_info("xen_balloon: Initialising balloon driver.\n"); + pr_info("xen/balloon: Initialising balloon driver.\n"); - balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); + if (xen_pv_domain()) + nr_pages = xen_start_info->nr_pages; + else + nr_pages = max_pfn; + balloon_stats.current_pages = min(nr_pages, max_pfn); balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; - balloon_stats.driver_pages = 0UL; - - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; - register_balloon(&balloon_sysdev); + balloon_stats.schedule_delay = 1; + balloon_stats.max_schedule_delay = 32; + balloon_stats.retry_count = 1; + balloon_stats.max_retry_count = RETRY_UNLIMITED; /* * Initialise the balloon with excess memory space. We need @@ -432,153 +471,9 @@ static int __init balloon_init(void) __balloon_append(page); } - target_watch.callback = watch_target; - xenstore_notifier.notifier_call = balloon_init_watcher; - - register_xenstore_notifier(&xenstore_notifier); - return 0; } subsys_initcall(balloon_init); -static void balloon_exit(void) -{ - /* XXX - release balloon here */ - return; -} - -module_exit(balloon_exit); - -#define BALLOON_SHOW(name, format, args...) \ - static ssize_t show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } \ - static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) - -BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); -BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); -BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); - -static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); -} - -static ssize_t store_target_kb(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, - show_target_kb, store_target_kb); - - -static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", - (unsigned long long)balloon_stats.target_pages - << PAGE_SHIFT); -} - -static ssize_t store_target(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = memparse(buf, &endchar); - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, - show_target, store_target); - - -static struct sysdev_attribute *balloon_attrs[] = { - &attr_target_kb, - &attr_target, -}; - -static struct attribute *balloon_info_attrs[] = { - &attr_current_kb.attr, - &attr_low_kb.attr, - &attr_high_kb.attr, - &attr_driver_kb.attr, - NULL -}; - -static struct attribute_group balloon_info_group = { - .name = "info", - .attrs = balloon_info_attrs, -}; - -static struct sysdev_class balloon_sysdev_class = { - .name = BALLOON_CLASS_NAME, -}; - -static int register_balloon(struct sys_device *sysdev) -{ - int i, error; - - error = sysdev_class_register(&balloon_sysdev_class); - if (error) - return error; - - sysdev->id = 0; - sysdev->cls = &balloon_sysdev_class; - - error = sysdev_register(sysdev); - if (error) { - sysdev_class_unregister(&balloon_sysdev_class); - return error; - } - - for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { - error = sysdev_create_file(sysdev, balloon_attrs[i]); - if (error) - goto fail; - } - - error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); - if (error) - goto fail; - - return 0; - - fail: - while (--i >= 0) - sysdev_remove_file(sysdev, balloon_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&balloon_sysdev_class); - return error; -} - MODULE_LICENSE("GPL"); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4591853f6ebc..02b5a9c05cfa 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -694,7 +694,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, handle_level_irq, name); xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); - ret = set_irq_msi(irq, msidesc); + ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; out: @@ -818,6 +818,21 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; } +static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); +} + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { @@ -913,6 +928,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, } EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -1052,6 +1090,13 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) } static DEFINE_PER_CPU(unsigned, xed_nesting_count); +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~0UL) << i)) /* * Search the CPUs pending events bitmasks. For each one found, map @@ -1064,6 +1109,9 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count); */ static void __xen_evtchn_do_upcall(void) { + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; int cpu = get_cpu(); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); @@ -1082,17 +1130,57 @@ static void __xen_evtchn_do_upcall(void) wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); + unsigned long words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = __ffs(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + if (word_idx == start_word_idx) { + /* We scan the starting word in two parts */ + if (i == 0) + /* 1st time: start in the middle */ + bit_idx = start_bit_idx; + else + /* 2nd time: mask bits done already */ + bit_idx &= (1UL << start_bit_idx) - 1; + } - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; + do { + unsigned long bits; + int port, irq; struct irq_desc *desc; + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = __ffs(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_LONG) + bit_idx; + irq = evtchn_to_irq[port]; + mask_evtchn(port); clear_evtchn(port); @@ -1101,7 +1189,21 @@ static void __xen_evtchn_do_upcall(void) if (desc) generic_handle_irq_desc(irq, desc); } - } + + bit_idx = (bit_idx + 1) % BITS_PER_LONG; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_LONG); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_LONG; } BUG_ON(!irqs_disabled()); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c new file mode 100644 index 000000000000..a7ffdfe19fc9 --- /dev/null +++ b/drivers/xen/gntalloc.c @@ -0,0 +1,545 @@ +/****************************************************************************** + * gntalloc.c + * + * Device for creating grant references (in user-space) that may be shared + * with other domains. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * This driver exists to allow userspace programs in Linux to allocate kernel + * memory that will later be shared with another domain. Without this device, + * Linux userspace programs cannot create grant references. + * + * How this stuff works: + * X -> granting a page to Y + * Y -> mapping the grant from X + * + * 1. X uses the gntalloc device to allocate a page of kernel memory, P. + * 2. X creates an entry in the grant table that says domid(Y) can access P. + * This is done without a hypercall unless the grant table needs expansion. + * 3. X gives the grant reference identifier, GREF, to Y. + * 4. Y maps the page, either directly into kernel memory for use in a backend + * driver, or via a the gntdev device to map into the address space of an + * application running in Y. This is the first point at which Xen does any + * tracking of the page. + * 5. A program in X mmap()s a segment of the gntalloc device that corresponds + * to the shared page, and can now communicate with Y over the shared page. + * + * + * NOTE TO USERSPACE LIBRARIES: + * The grant allocation and mmap()ing are, naturally, two separate operations. + * You set up the sharing by calling the create ioctl() and then the mmap(). + * Teardown requires munmap() and either close() or ioctl(). + * + * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant + * reference, this device can be used to consume kernel memory by leaving grant + * references mapped by another domain when an application exits. Therefore, + * there is a global limit on the number of pages that can be allocated. When + * all references to the page are unmapped, it will be freed during the next + * grant operation. + */ + +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/gntalloc.h> +#include <xen/events.h> + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by " + "the gntalloc device"); + +static LIST_HEAD(gref_list); +static DEFINE_SPINLOCK(gref_lock); +static int gref_size; + +struct notify_info { + uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */ + uint16_t flags:2; /* Bits 12-13: Unmap notification flags */ + int event; /* Port (event channel) to notify */ +}; + +/* Metadata on a grant reference. */ +struct gntalloc_gref { + struct list_head next_gref; /* list entry gref_list */ + struct list_head next_file; /* list entry file->list, if open */ + struct page *page; /* The shared page */ + uint64_t file_index; /* File offset for mmap() */ + unsigned int users; /* Use count - when zero, waiting on Xen */ + grant_ref_t gref_id; /* The grant reference number */ + struct notify_info notify; /* Unmap notification */ +}; + +struct gntalloc_file_private_data { + struct list_head list; + uint64_t index; +}; + +static void __del_gref(struct gntalloc_gref *gref); + +static void do_cleanup(void) +{ + struct gntalloc_gref *gref, *n; + list_for_each_entry_safe(gref, n, &gref_list, next_gref) { + if (!gref->users) + __del_gref(gref); + } +} + +static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, + uint32_t *gref_ids, struct gntalloc_file_private_data *priv) +{ + int i, rc, readonly; + LIST_HEAD(queue_gref); + LIST_HEAD(queue_file); + struct gntalloc_gref *gref; + + readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE); + rc = -ENOMEM; + for (i = 0; i < op->count; i++) { + gref = kzalloc(sizeof(*gref), GFP_KERNEL); + if (!gref) + goto undo; + list_add_tail(&gref->next_gref, &queue_gref); + list_add_tail(&gref->next_file, &queue_file); + gref->users = 1; + gref->file_index = op->index + i * PAGE_SIZE; + gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!gref->page) + goto undo; + + /* Grant foreign access to the page. */ + gref->gref_id = gnttab_grant_foreign_access(op->domid, + pfn_to_mfn(page_to_pfn(gref->page)), readonly); + if (gref->gref_id < 0) { + rc = gref->gref_id; + goto undo; + } + gref_ids[i] = gref->gref_id; + } + + /* Add to gref lists. */ + spin_lock(&gref_lock); + list_splice_tail(&queue_gref, &gref_list); + list_splice_tail(&queue_file, &priv->list); + spin_unlock(&gref_lock); + + return 0; + +undo: + spin_lock(&gref_lock); + gref_size -= (op->count - i); + + list_for_each_entry(gref, &queue_file, next_file) { + /* __del_gref does not remove from queue_file */ + __del_gref(gref); + } + + /* It's possible for the target domain to map the just-allocated grant + * references by blindly guessing their IDs; if this is done, then + * __del_gref will leave them in the queue_gref list. They need to be + * added to the global list so that we can free them when they are no + * longer referenced. + */ + if (unlikely(!list_empty(&queue_gref))) + list_splice_tail(&queue_gref, &gref_list); + spin_unlock(&gref_lock); + return rc; +} + +static void __del_gref(struct gntalloc_gref *gref) +{ + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + uint8_t *tmp = kmap(gref->page); + tmp[gref->notify.pgoff] = 0; + kunmap(gref->page); + } + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + notify_remote_via_evtchn(gref->notify.event); + + gref->notify.flags = 0; + + if (gref->gref_id > 0) { + if (gnttab_query_foreign_access(gref->gref_id)) + return; + + if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) + return; + } + + gref_size--; + list_del(&gref->next_gref); + + if (gref->page) + __free_page(gref->page); + + kfree(gref); +} + +/* finds contiguous grant references in a file, returns the first */ +static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv, + uint64_t index, uint32_t count) +{ + struct gntalloc_gref *rv = NULL, *gref; + list_for_each_entry(gref, &priv->list, next_file) { + if (gref->file_index == index && !rv) + rv = gref; + if (rv) { + if (gref->file_index != index) + return NULL; + index += PAGE_SIZE; + count--; + if (count == 0) + return rv; + } + } + return NULL; +} + +/* + * ------------------------------------- + * File operations. + * ------------------------------------- + */ +static int gntalloc_open(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_nomem; + INIT_LIST_HEAD(&priv->list); + + filp->private_data = priv; + + pr_debug("%s: priv %p\n", __func__, priv); + + return 0; + +out_nomem: + return -ENOMEM; +} + +static int gntalloc_release(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_gref *gref; + + pr_debug("%s: priv %p\n", __func__, priv); + + spin_lock(&gref_lock); + while (!list_empty(& |