summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/block/virtio_blk.c148
-rw-r--r--drivers/char/hw_random/virtio-rng.c2
-rw-r--r--drivers/char/virtio_console.c14
-rw-r--r--drivers/lguest/Kconfig5
-rw-r--r--drivers/lguest/core.c67
-rw-r--r--drivers/lguest/lg.h6
-rw-r--r--drivers/lguest/lguest_user.c6
-rw-r--r--drivers/lguest/page_tables.c567
-rw-r--r--drivers/lguest/x86/core.c7
-rw-r--r--drivers/net/caif/Kconfig14
-rw-r--r--drivers/net/caif/Makefile3
-rw-r--r--drivers/net/caif/caif_virtio.c790
-rw-r--r--drivers/net/virtio_net.c77
-rw-r--r--drivers/rpmsg/virtio_rpmsg_bus.c8
-rw-r--r--drivers/scsi/virtio_scsi.c487
-rw-r--r--drivers/vhost/Kconfig8
-rw-r--r--drivers/vhost/Makefile2
-rw-r--r--drivers/vhost/test.c4
-rw-r--r--drivers/vhost/vringh.c1007
-rw-r--r--drivers/virtio/virtio_balloon.c6
-rw-r--r--drivers/virtio/virtio_ring.c297
22 files changed, 2850 insertions, 677 deletions
diff --git a/drivers/Makefile b/drivers/Makefile
index 33360de63650..8e57688ebd95 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
obj-$(CONFIG_OF) += of/
obj-$(CONFIG_SSB) += ssb/
obj-$(CONFIG_BCMA) += bcma/
-obj-$(CONFIG_VHOST_NET) += vhost/
+obj-$(CONFIG_VHOST_RING) += vhost/
obj-$(CONFIG_VLYNQ) += vlynq/
obj-$(CONFIG_STAGING) += staging/
obj-y += platform/
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a25bc0d..64723953e1c9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
return vbr;
}
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
- struct virtblk_req *vbr,
- unsigned long out,
- unsigned long in)
+static int __virtblk_add_req(struct virtqueue *vq,
+ struct virtblk_req *vbr,
+ struct scatterlist *data_sg,
+ bool have_data)
{
- DEFINE_WAIT(wait);
+ struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+ unsigned int num_out = 0, num_in = 0;
+ int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
- for (;;) {
- prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+ sgs[num_out++] = &hdr;
- spin_lock_irq(vblk->disk->queue->queue_lock);
- if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
- GFP_ATOMIC) < 0) {
- spin_unlock_irq(vblk->disk->queue->queue_lock);
- io_schedule();
- } else {
- virtqueue_kick(vblk->vq);
- spin_unlock_irq(vblk->disk->queue->queue_lock);
- break;
- }
+ /*
+ * If this is a packet command we need a couple of additional headers.
+ * Behind the normal outhdr we put a segment with the scsi command
+ * block, and before the normal inhdr we put the sense data and the
+ * inhdr with additional status information.
+ */
+ if (type == VIRTIO_BLK_T_SCSI_CMD) {
+ sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+ sgs[num_out++] = &cmd;
+ }
+ if (have_data) {
+ if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+ sgs[num_out++] = data_sg;
+ else
+ sgs[num_out + num_in++] = data_sg;
}
- finish_wait(&vblk->queue_wait, &wait);
+ if (type == VIRTIO_BLK_T_SCSI_CMD) {
+ sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+ sgs[num_out + num_in++] = &sense;
+ sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+ sgs[num_out + num_in++] = &inhdr;
+ }
+
+ sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+ sgs[num_out + num_in++] = &status;
+
+ return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
-static inline void virtblk_add_req(struct virtblk_req *vbr,
- unsigned int out, unsigned int in)
+static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
{
struct virtio_blk *vblk = vbr->vblk;
+ DEFINE_WAIT(wait);
+ int ret;
spin_lock_irq(vblk->disk->queue->queue_lock);
- if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
- GFP_ATOMIC) < 0)) {
+ while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
+ have_data)) < 0)) {
+ prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
spin_unlock_irq(vblk->disk->queue->queue_lock);
- virtblk_add_buf_wait(vblk, vbr, out, in);
- return;
+ io_schedule();
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+
+ finish_wait(&vblk->queue_wait, &wait);
}
+
virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock);
}
-static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+static void virtblk_bio_send_flush(struct virtblk_req *vbr)
{
- unsigned int out = 0, in = 0;
-
vbr->flags |= VBLK_IS_FLUSH;
vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = 0;
- sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
- sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
-
- virtblk_add_req(vbr, out, in);
- return 0;
+ virtblk_add_req(vbr, false);
}
-static int virtblk_bio_send_data(struct virtblk_req *vbr)
+static void virtblk_bio_send_data(struct virtblk_req *vbr)
{
struct virtio_blk *vblk = vbr->vblk;
- unsigned int num, out = 0, in = 0;
struct bio *bio = vbr->bio;
+ bool have_data;
vbr->flags &= ~VBLK_IS_FLUSH;
vbr->out_hdr.type = 0;
vbr->out_hdr.sector = bio->bi_sector;
vbr->out_hdr.ioprio = bio_prio(bio);
- sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
- num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
-
- sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
- sizeof(vbr->status));
-
- if (num) {
- if (bio->bi_rw & REQ_WRITE) {
+ if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
+ have_data = true;
+ if (bio->bi_rw & REQ_WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
- out += num;
- } else {
+ else
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
- in += num;
- }
- }
+ } else
+ have_data = false;
- virtblk_add_req(vbr, out, in);
-
- return 0;
+ virtblk_add_req(vbr, have_data);
}
static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
struct request *req)
{
- unsigned long num, out = 0, in = 0;
+ unsigned int num;
struct virtblk_req *vbr;
vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
- /*
- * If this is a packet command we need a couple of additional headers.
- * Behind the normal outhdr we put a segment with the scsi command
- * block, and before the normal inhdr we put the sense data and the
- * inhdr with additional status information before the normal inhdr.
- */
- if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
- sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
-
- num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
-
- if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
- sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
- sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
- sizeof(vbr->in_hdr));
- }
-
- sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
- sizeof(vbr->status));
-
+ num = blk_rq_map_sg(q, vbr->req, vblk->sg);
if (num) {
- if (rq_data_dir(vbr->req) == WRITE) {
+ if (rq_data_dir(vbr->req) == WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
- out += num;
- } else {
+ else
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
- in += num;
- }
}
- if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
- GFP_ATOMIC) < 0) {
+ if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
mempool_free(vbr, vblk->pool);
return false;
}
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
+ char *envp[] = { "RESIZE=1", NULL };
u64 capacity, size;
mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk);
+ kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
done:
mutex_unlock(&vblk->config_lock);
}
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 6bf4d47324eb..ef46a9cfd832 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
sg_init_one(&sg, buf, size);
/* There should always be room for one buffer. */
- if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+ if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ce5f3fc25d6d..1b456fe9b87a 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
};
static struct ports_driver_data pdrvdata;
-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);
/* This struct holds information that's relevant only for console ports */
struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
sg_init_one(sg, buf->buf, buf->size);
- ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+ ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
virtqueue_kick(vq);
if (!ret)
ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
sg_init_one(sg, &cpkt, sizeof(cpkt));
spin_lock(&portdev->c_ovq_lock);
- if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+ if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
virtqueue_kick(vq);
while (!virtqueue_get_buf(vq, &len))
cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
reclaim_consumed_buffers(port);
- err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+ err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
/* Tell Host to go! */
virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
spin_lock_irq(&port->inbuf_lock);
if (port->guest_connected) {
spin_unlock_irq(&port->inbuf_lock);
- ret = -EMFILE;
+ ret = -EBUSY;
goto out;
}
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
return hvc_instantiate(0, 0, &hv_ops);
}
-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
{
int ret;
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea19ade..ee035ec4526b 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
---help---
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
- "lguest" command found in the Documentation/virtual/lguest
- directory.
+ "lguest" command found in the tools/lguest directory.
Note that "lguest" is pronounced to rhyme with "fell quest",
- not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+ not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc0083d87..0bf1e4edf04d 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
#include <asm/asm-offsets.h>
#include "lg.h"
-
+unsigned long switcher_addr;
+struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
* easy.
*/
+ /* We assume Switcher text fits into a single page. */
+ if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+ printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+ end_switcher_text - start_switcher_text);
+ return -EINVAL;
+ }
+
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
- switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
- GFP_KERNEL);
- if (!switcher_page) {
+ lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+ * TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
- switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!switcher_page[i]) {
+ lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!lg_switcher_pages[i]) {
err = -ENOMEM;
goto free_some_pages;
}
}
/*
- * First we check that the Switcher won't overlap the fixmap area at
- * the top of memory. It's currently nowhere near, but it could have
- * very strange effects if it ever happened.
+ * We place the Switcher underneath the fixmap area, which is the
+ * highest virtual address we can get. This is important, since we
+ * tell the Guest it can't access this memory, so we want its ceiling
+ * as high as possible.
*/
- if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
- err = -ENOMEM;
- printk("lguest: mapping switcher would thwack fixmap\n");
- goto free_pages;
- }
+ switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
/*
- * Now we reserve the "virtual memory area" we want: 0xFFC00000
- * (SWITCHER_ADDR). We might not get it in theory, but in practice
- * it's worked so far. The end address needs +1 because __get_vm_area
- * allocates an extra guard page, so we need space for that.
+ * Now we reserve the "virtual memory area" we want. We might
+ * not get it in theory, but in practice it's worked so far.
+ * The end address needs +1 because __get_vm_area allocates an
+ * extra guard page, so we need space for that.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
- VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+ VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
/*
* This code actually sets up the pages we've allocated to appear at
- * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+ * switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages. It increments that pointer, but we don't
* care.
*/
- pagep = switcher_page;
+ pagep = lg_switcher_pages;
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
for (--i; i >= 0; i--)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
out:
return err;
}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
vunmap(switcher_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
}
/*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
if (err)
goto out;
- /* Now we set up the pagetable implementation for the Guests. */
- err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
- if (err)
- goto unmap;
-
/* We might need to reserve an interrupt vector. */
err = init_interrupts();
if (err)
- goto free_pgtables;
+ goto unmap;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
free_interrupts:
free_interrupts();
-free_pgtables:
- free_pagetables();
unmap:
unmap_switcher();
out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
- free_pagetables();
unmap_switcher();
lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06e6590..2eef40be4c04 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
#include <asm/lguest.h>
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
struct pgdir {
unsigned long gpgdir;
+ bool switcher_mapped;
+ int last_host_cpu;
pgd_t *pgdir;
};
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
/*H:035
* Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc9904d..4263f4cc8c55 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
*/
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
{
- /* We have a limited number the number of CPUs in the lguest struct. */
+ /* We have a limited number of CPUs in the lguest struct. */
if (id >= ARRAY_SIZE(cpu->lg->cpus))
return -EINVAL;
/* Set up this CPU's id, and pointer back to the lguest struct. */
cpu->id = id;
- cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+ cpu->lg = container_of(cpu, struct lguest, cpus[id]);
cpu->lg->nr_cpus++;
/* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
if (!cpu->regs_page)
return -ENOMEM;
- /* We actually put the registers at the bottom of the page. */
+ /* We actually put the registers at the end of the page. */
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
/*
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baabaee25..699187ab3800 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
* converted Guest pages when running the Guest.
:*/
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
* GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -62,22 +62,11 @@
* will need the last pmd entry of the last pmd page.
*/
#ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
-#define RESERVE_MEM 2U
#define CHECK_GPGD_MASK _PAGE_PRESENT
#else
-#define RESERVE_MEM 4U
#define CHECK_GPGD_MASK _PAGE_TABLE
#endif
-/*
- * We actually need a separate PTE page for each CPU. Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
/*H:320
* The page table code is curly enough to need helper functions to keep it
* clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
unsigned int index = pgd_index(vaddr);
-#ifndef CONFIG_X86_PAE
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (index >= SWITCHER_PGD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-#endif
/* Return a pointer index'th pgd entry for the i'th page table. */
return &cpu->lg->pgdirs[i].pgdir[index];
}
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
unsigned int index = pmd_index(vaddr);
pmd_t *page;
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
- index >= SWITCHER_PMD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-
/* You should never call this if the PGD entry wasn't valid */
BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
}
/*:*/
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & _PAGE_PSE) ||
- pte_pfn(gpte) >= cpu->lg->pfn_limit)
+ pte_pfn(gpte) >= cpu->lg->pfn_limit) {
kill_guest(cpu, "bad page table entry");
+ return false;
+ }
+ return true;
}
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
- (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+ (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page directory entry");
+ return false;
+ }
+ return true;
}
#ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
- (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+ (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page middle directory entry");
+ return false;
+ }
+ return true;
}
#endif
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here. That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
+/*H:331
+ * This is the core routine to walk the shadow page tables and find the page
+ * table entry for a specific address.
*
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true. Otherwise, it was a real fault and we need to tell the Guest.
+ * If allocate is set, then we allocate any missing levels, setting the flags
+ * on the new page directory and mid-level directories using the arguments
+ * (which are copied from the Guest's page table entries).
*/
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
+ int pgd_flags, int pmd_flags)
{
- pgd_t gpgd;
pgd_t *spgd;
- unsigned long gpte_ptr;
- pte_t gpte;
- pte_t *spte;
-
/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
pmd_t *spmd;
- pmd_t gpmd;
#endif
- /* First step: get the top-level Guest page table entry. */
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpgd = __pgd(CHECK_GPGD_MASK);
- } else {
- gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
- /* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- return false;
- }
-
- /* Now look at the matching shadow entry. */
+ /* Get top level entry. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ return NULL;
}
- /* We check that the Guest pgd is OK. */
- check_gpgd(cpu, gpgd);
/*
* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated.
*/
- set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
+ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
}
+ /*
+ * Intel's Physical Address Extension actually uses three levels of
+ * page tables, so we need to look in the mid-level.
+ */
#ifdef CONFIG_X86_PAE
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpmd = __pmd(_PAGE_TABLE);
- } else {
- gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- /* Middle level not present? We can't map it in. */
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
- return false;
- }
-
- /* Now look at the matching shadow entry. */
+ /* Now look at the mid-level shadow entry. */
spmd = spmd_addr(cpu, *spgd, vaddr);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
- kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ kill_guest(cpu, "out of memory allocating pmd page");
+ return NULL;
}
- /* We check that the Guest pmd is OK. */
- check_gpmd(cpu, gpmd);
-
/*
* And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated.
*/
- set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+ set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
+ }
+#endif
+
+ /* Get the pointer to the shadow PTE entry we're going to set. */
+ return spte_addr(cpu, *spgd, vaddr);
+}
+
+/*H:330
+ * (i) Looking up a page table entry when the Guest faults.
+ *
+ * We saw this call in run_guest(): when we see a page fault in the Guest, we
+ * come here. That's because we only set up the shadow page tables lazily as
+ * they're needed, so we get page faults all the time and quietly fix them up
+ * and return to the Guest without it knowing.
+ *
+ * If we fixed up the fault (ie. we mapped the address), this routine returns
+ * true. Otherwise, it was a real fault and we need to tell the Guest.
+ */
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+{
+ unsigned long gpte_ptr;
+ pte_t gpte;
+ pte_t *spte;
+ pmd_t gpmd;
+ pgd_t gpgd;
+
+ /* We never demand page the Switcher, so trying is a mistake. */
+ if (vaddr >= switcher_addr)
+ return false;
+
+ /* First step: get the top-level Guest page table entry. */
+ if (unlikely(cpu->linear_pages)) {
+ /* Faking up a linear mapping. */
+ gpgd = __pgd(CHECK_GPGD_MASK);
+ } else {
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+ /* Toplevel not present? We can't map it in. */
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpgd(cpu, gpgd))
+ return false;
+ }
+
+ /* This "mid-level" entry is only used for non-linear, PAE mode. */
+ gpmd = __pmd(_PAGE_TABLE);
+
+#ifdef CONFIG_X86_PAE
+ if (likely(!cpu->linear_pages)) {
+ gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+ /* Middle level not present? We can't map it in. */
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpmd(cpu, gpmd))
+ return false;
}
/*
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary).
*/
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return false;
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte);
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(cpu, *spgd, vaddr);
+ spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
+ if (!spte)
+ return false;
/*
* If there was a valid shadow PTE entry here before, we release it.
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
*/
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
- pgd_t *spgd;
+ pte_t *spte;
unsigned long flags;
-#ifdef CONFIG_X86_PAE
- pmd_t *spmd;
-#endif
- /* Look at the current top level entry: is it present? */
- spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
- if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
+ /* You can't put your stack in the Switcher! */
+ if (vaddr >= switcher_addr)
return false;
-#ifdef CONFIG_X86_PAE
- spmd = spmd_addr(cpu, *spgd, vaddr);
- if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+ /* If there's no shadow PTE, it's not writable. */
+ spte = find_spte(cpu, vaddr, false, 0, 0);
+ if (!spte)
return false;
-#endif
/*
* Check the flags on the pte entry itself: it must be present and
* writable.
*/
- flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
-
+ flags = pte_flags(*spte);
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
int *blank_pgdir)
{
unsigned int next;
-#ifdef CONFIG_X86_PAE
- pmd_t *pmd_table;
-#endif
/*
* We pick one entry at random to throw out. Choosing the Least
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd;
else {
-#ifdef CONFIG_X86_PAE
/*
- * In PAE mode, allocate a pmd page and populate the
- * last pgd entry.
+ * This is a blank page, so there are no kernel