summaryrefslogtreecommitdiffstats
path: root/drivers/staging/ramster
diff options
context:
space:
mode:
authorDan Magenheimer <dan.magenheimer@oracle.com>2012-02-15 07:54:16 -0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2012-02-15 09:02:03 -0800
commit19ee3ef5f4bb22d17eb73d89a520437745b8b444 (patch)
tree9d51281ef697f834529386aca0fa168783dfe1ea /drivers/staging/ramster
parentb605c9621e57e72cbd45d59fd93d97b4a75c3597 (diff)
staging: ramster: local compression + tmem
RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of kernels to dynamically pool their RAM. This patch copies files from drivers/staging/zcache. RAMster compresses pages locally before transmitting them to another node, so we can leverage the zcache and tmem code directly. Note: there are no ramster-specific changes yet to these files. (Why copy? The ramster tmem.c/tmem.h changes are definitely shareable between zcache and ramster; the eventual destination for tmem.c is the linux lib directory. Ramster changes to zcache are more substantial and zcache is currently undergoing some significant unrelated changes (including a new allocator and breaking zcache-main.c into smaller files), so it seemed best to branch temporarily and merge later.) Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/staging/ramster')
-rw-r--r--drivers/staging/ramster/Kconfig13
-rw-r--r--drivers/staging/ramster/Makefile3
-rw-r--r--drivers/staging/ramster/tmem.c770
-rw-r--r--drivers/staging/ramster/tmem.h206
-rw-r--r--drivers/staging/ramster/zcache-main.c1988
5 files changed, 2980 insertions, 0 deletions
diff --git a/drivers/staging/ramster/Kconfig b/drivers/staging/ramster/Kconfig
new file mode 100644
index 000000000000..7fabcb2bc80d
--- /dev/null
+++ b/drivers/staging/ramster/Kconfig
@@ -0,0 +1,13 @@
+config ZCACHE
+ tristate "Dynamic compression of swap pages and clean pagecache pages"
+ depends on CLEANCACHE || FRONTSWAP
+ select XVMALLOC
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ default n
+ help
+ Zcache doubles RAM efficiency while providing a significant
+ performance boosts on many workloads. Zcache uses lzo1x
+ compression and an in-kernel implementation of transcendent
+ memory to store clean page cache pages and swap in RAM,
+ providing a noticeable reduction in disk I/O.
diff --git a/drivers/staging/ramster/Makefile b/drivers/staging/ramster/Makefile
new file mode 100644
index 000000000000..60daa272c204
--- /dev/null
+++ b/drivers/staging/ramster/Makefile
@@ -0,0 +1,3 @@
+zcache-y := zcache-main.o tmem.o
+
+obj-$(CONFIG_ZCACHE) += zcache.o
diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c
new file mode 100644
index 000000000000..1ca66ea9b281
--- /dev/null
+++ b/drivers/staging/ramster/tmem.c
@@ -0,0 +1,770 @@
+/*
+ * In-kernel transcendent memory (generic implementation)
+ *
+ * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ *
+ * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented
+ * "handles" (triples containing a pool id, and object id, and an index), to
+ * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
+ * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
+ * set of functions (pamops). Each pampd contains some representation of
+ * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
+ * pages and must be able to insert, find, and delete these pages at a
+ * potential frequency of thousands per second concurrently across many CPUs,
+ * (and, if used with KVM, across many vcpus across many guests).
+ * Tmem is tracked with a hierarchy of data structures, organized by
+ * the elements in a handle-tuple: pool_id, object_id, and page index.
+ * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
+ * Each pool, contains a hash table of rb_trees of tmem_objs. Each
+ * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
+ * nodes called tmem_objnodes. Each leaf pointer in this tree points to
+ * a pampd, which is accessible only through a small set of callbacks
+ * registered by the PAM implementation (see tmem_register_pamops). Tmem
+ * does all memory allocation via a set of callbacks registered by the tmem
+ * host implementation (e.g. see tmem_register_hostops).
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+
+#include "tmem.h"
+
+/* data structure sentinels used for debugging... see tmem.h */
+#define POOL_SENTINEL 0x87658765
+#define OBJ_SENTINEL 0x12345678
+#define OBJNODE_SENTINEL 0xfedcba09
+
+/*
+ * A tmem host implementation must use this function to register callbacks
+ * for memory allocation.
+ */
+static struct tmem_hostops tmem_hostops;
+
+static void tmem_objnode_tree_init(void);
+
+void tmem_register_hostops(struct tmem_hostops *m)
+{
+ tmem_objnode_tree_init();
+ tmem_hostops = *m;
+}
+
+/*
+ * A tmem host implementation must use this function to register
+ * callbacks for a page-accessible memory (PAM) implementation
+ */
+static struct tmem_pamops tmem_pamops;
+
+void tmem_register_pamops(struct tmem_pamops *m)
+{
+ tmem_pamops = *m;
+}
+
+/*
+ * Oid's are potentially very sparse and tmem_objs may have an indeterminately
+ * short life, being added and deleted at a relatively high frequency.
+ * So an rb_tree is an ideal data structure to manage tmem_objs. But because
+ * of the potentially huge number of tmem_objs, each pool manages a hashtable
+ * of rb_trees to reduce search, insert, delete, and rebalancing time.
+ * Each hashbucket also has a lock to manage concurrent access.
+ *
+ * The following routines manage tmem_objs. When any tmem_obj is accessed,
+ * the hashbucket lock must be held.
+ */
+
+/* searches for object==oid in pool, returns locked object if found */
+static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
+ struct tmem_oid *oidp)
+{
+ struct rb_node *rbnode;
+ struct tmem_obj *obj;
+
+ rbnode = hb->obj_rb_root.rb_node;
+ while (rbnode) {
+ BUG_ON(RB_EMPTY_NODE(rbnode));
+ obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
+ switch (tmem_oid_compare(oidp, &obj->oid)) {
+ case 0: /* equal */
+ goto out;
+ case -1:
+ rbnode = rbnode->rb_left;
+ break;
+ case 1:
+ rbnode = rbnode->rb_right;
+ break;
+ }
+ }
+ obj = NULL;
+out:
+ return obj;
+}
+
+static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
+
+/* free an object that has no more pampds in it */
+static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
+{
+ struct tmem_pool *pool;
+
+ BUG_ON(obj == NULL);
+ ASSERT_SENTINEL(obj, OBJ);
+ BUG_ON(obj->pampd_count > 0);
+ pool = obj->pool;
+ BUG_ON(pool == NULL);
+ if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
+ tmem_pampd_destroy_all_in_obj(obj);
+ BUG_ON(obj->objnode_tree_root != NULL);
+ BUG_ON((long)obj->objnode_count != 0);
+ atomic_dec(&pool->obj_count);
+ BUG_ON(atomic_read(&pool->obj_count) < 0);
+ INVERT_SENTINEL(obj, OBJ);
+ obj->pool = NULL;
+ tmem_oid_set_invalid(&obj->oid);
+ rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
+}
+
+/*
+ * initialize, and insert an tmem_object_root (called only if find failed)
+ */
+static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
+ struct tmem_pool *pool,
+ struct tmem_oid *oidp)
+{
+ struct rb_root *root = &hb->obj_rb_root;
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct tmem_obj *this;
+
+ BUG_ON(pool == NULL);
+ atomic_inc(&pool->obj_count);
+ obj->objnode_tree_height = 0;
+ obj->objnode_tree_root = NULL;
+ obj->pool = pool;
+ obj->oid = *oidp;
+ obj->objnode_count = 0;
+ obj->pampd_count = 0;
+ (*tmem_pamops.new_obj)(obj);
+ SET_SENTINEL(obj, OBJ);
+ while (*new) {
+ BUG_ON(RB_EMPTY_NODE(*new));
+ this = rb_entry(*new, struct tmem_obj, rb_tree_node);
+ parent = *new;
+ switch (tmem_oid_compare(oidp, &this->oid)) {
+ case 0:
+ BUG(); /* already present; should never happen! */
+ break;
+ case -1:
+ new = &(*new)->rb_left;
+ break;
+ case 1:
+ new = &(*new)->rb_right;
+ break;
+ }
+ }
+ rb_link_node(&obj->rb_tree_node, parent, new);
+ rb_insert_color(&obj->rb_tree_node, root);
+}
+
+/*
+ * Tmem is managed as a set of tmem_pools with certain attributes, such as
+ * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
+ * and all pampds that belong to a tmem_pool. A tmem_pool is created
+ * or deleted relatively rarely (for example, when a filesystem is
+ * mounted or unmounted.
+ */
+
+/* flush all data from a pool and, optionally, free it */
+static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
+{
+ struct rb_node *rbnode;
+ struct tmem_obj *obj;
+ struct tmem_hashbucket *hb = &pool->hashbucket[0];
+ int i;
+
+ BUG_ON(pool == NULL);
+ for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
+ spin_lock(&hb->lock);
+ rbnode = rb_first(&hb->obj_rb_root);
+ while (rbnode != NULL) {
+ obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
+ rbnode = rb_next(rbnode);
+ tmem_pampd_destroy_all_in_obj(obj);
+ tmem_obj_free(obj, hb);
+ (*tmem_hostops.obj_free)(obj, pool);
+ }
+ spin_unlock(&hb->lock);
+ }
+ if (destroy)
+ list_del(&pool->pool_list);
+}
+
+/*
+ * A tmem_obj contains a radix-tree-like tree in which the intermediate
+ * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
+ * is very specialized and tuned for specific uses and is not particularly
+ * suited for use from this code, though some code from the core algorithms has
+ * been reused, thus the copyright notices below). Each tmem_objnode contains
+ * a set of pointers which point to either a set of intermediate tmem_objnodes
+ * or a set of of pampds.
+ *
+ * Portions Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ */
+
+struct tmem_objnode_tree_path {
+ struct tmem_objnode *objnode;
+ int offset;
+};
+
+/* objnode height_to_maxindex translation */
+static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
+
+static void tmem_objnode_tree_init(void)
+{
+ unsigned int ht, tmp;
+
+ for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
+ tmp = ht * OBJNODE_TREE_MAP_SHIFT;
+ if (tmp >= OBJNODE_TREE_INDEX_BITS)
+ tmem_objnode_tree_h2max[ht] = ~0UL;
+ else
+ tmem_objnode_tree_h2max[ht] =
+ (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
+ }
+}
+
+static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
+{
+ struct tmem_objnode *objnode;
+
+ ASSERT_SENTINEL(obj, OBJ);
+ BUG_ON(obj->pool == NULL);
+ ASSERT_SENTINEL(obj->pool, POOL);
+ objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
+ if (unlikely(objnode == NULL))
+ goto out;
+ objnode->obj = obj;
+ SET_SENTINEL(objnode, OBJNODE);
+ memset(&objnode->slots, 0, sizeof(objnode->slots));
+ objnode->slots_in_use = 0;
+ obj->objnode_count++;
+out:
+ return objnode;
+}
+
+static void tmem_objnode_free(struct tmem_objnode *objnode)
+{
+ struct tmem_pool *pool;
+ int i;
+
+ BUG_ON(objnode == NULL);
+ for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
+ BUG_ON(objnode->slots[i] != NULL);
+ ASSERT_SENTINEL(objnode, OBJNODE);
+ INVERT_SENTINEL(objnode, OBJNODE);
+ BUG_ON(objnode->obj == NULL);
+ ASSERT_SENTINEL(objnode->obj, OBJ);
+ pool = objnode->obj->pool;
+ BUG_ON(pool == NULL);
+ ASSERT_SENTINEL(pool, POOL);
+ objnode->obj->objnode_count--;
+ objnode->obj = NULL;
+ (*tmem_hostops.objnode_free)(objnode, pool);
+}
+
+/*
+ * lookup index in object and return associated pampd (or NULL if not found)
+ */
+static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
+{
+ unsigned int height, shift;
+ struct tmem_objnode **slot = NULL;
+
+ BUG_ON(obj == NULL);
+ ASSERT_SENTINEL(obj, OBJ);
+ BUG_ON(obj->pool == NULL);
+ ASSERT_SENTINEL(obj->pool, POOL);
+
+ height = obj->objnode_tree_height;
+ if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
+ goto out;
+ if (height == 0 && obj->objnode_tree_root) {
+ slot = &obj->objnode_tree_root;
+ goto out;
+ }
+ shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
+ slot = &obj->objnode_tree_root;
+ while (height > 0) {
+ if (*slot == NULL)
+ goto out;
+ slot = (struct tmem_objnode **)
+ ((*slot)->slots +
+ ((index >> shift) & OBJNODE_TREE_MAP_MASK));
+ shift -= OBJNODE_TREE_MAP_SHIFT;
+ height--;
+ }
+out:
+ return slot != NULL ? (void **)slot : NULL;
+}
+
+static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
+{
+ struct tmem_objnode **slot;
+
+ slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
+ return slot != NULL ? *slot : NULL;
+}
+
+static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
+ void *new_pampd)
+{
+ struct tmem_objnode **slot;
+ void *ret = NULL;
+
+ slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
+ if ((slot != NULL) && (*slot != NULL)) {
+ void *old_pampd = *(void **)slot;
+ *(void **)slot = new_pampd;
+ (*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
+ ret = new_pampd;
+ }
+ return ret;
+}
+
+static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
+ void *pampd)
+{
+ int ret = 0;
+ struct tmem_objnode *objnode = NULL, *newnode, *slot;
+ unsigned int height, shift;
+ int offset = 0;
+
+ /* if necessary, extend the tree to be higher */
+ if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
+ height = obj->objnode_tree_height + 1;
+ if (index > tmem_objnode_tree_h2max[height])
+ while (index > tmem_objnode_tree_h2max[height])
+ height++;
+ if (obj->objnode_tree_root == NULL) {
+ obj->objnode_tree_height = height;
+ goto insert;
+ }
+ do {
+ newnode = tmem_objnode_alloc(obj);
+ if (!newnode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ newnode->slots[0] = obj->objnode_tree_root;
+ newnode->slots_in_use = 1;
+ obj->objnode_tree_root = newnode;
+ obj->objnode_tree_height++;
+ } while (height > obj->objnode_tree_height);
+ }
+insert:
+ slot = obj->objnode_tree_root;
+ height = obj->objnode_tree_height;
+ shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
+ while (height > 0) {
+ if (slot == NULL) {
+ /* add a child objnode. */
+ slot = tmem_objnode_alloc(obj);
+ if (!slot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (objnode) {
+
+ objnode->slots[offset] = slot;
+ objnode->slots_in_use++;
+ } else
+ obj->objnode_tree_root = slot;
+ }
+ /* go down a level */
+ offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
+ objnode = slot;
+ slot = objnode->slots[offset];
+ shift -= OBJNODE_TREE_MAP_SHIFT;
+ height--;
+ }
+ BUG_ON(slot != NULL);
+ if (objnode) {
+ objnode->slots_in_use++;
+ objnode->slots[offset] = pampd;
+ } else
+ obj->objnode_tree_root = pampd;
+ obj->pampd_count++;
+out:
+ return ret;
+}
+
+static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
+{
+ struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
+ struct tmem_objnode_tree_path *pathp = path;
+ struct tmem_objnode *slot = NULL;
+ unsigned int height, shift;
+ int offset;
+
+ BUG_ON(obj == NULL);
+ ASSERT_SENTINEL(obj, OBJ);
+ BUG_ON(obj->pool == NULL);
+ ASSERT_SENTINEL(obj->pool, POOL);
+ height = obj->objnode_tree_height;
+ if (index > tmem_objnode_tree_h2max[height])
+ goto out;
+ slot = obj->objnode_tree_root;
+ if (height == 0 && obj->objnode_tree_root) {
+ obj->objnode_tree_root = NULL;
+ goto out;
+ }
+ shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
+ pathp->objnode = NULL;
+ do {
+ if (slot == NULL)
+ goto out;
+ pathp++;
+ offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
+ pathp->offset = offset;
+ pathp->objnode = slot;
+ slot = slot->slots[offset];
+ shift -= OBJNODE_TREE_MAP_SHIFT;
+ height--;
+ } while (height > 0);
+ if (slot == NULL)
+ goto out;
+ while (pathp->objnode) {
+ pathp->objnode->slots[pathp->offset] = NULL;
+ pathp->objnode->slots_in_use--;
+ if (pathp->objnode->slots_in_use) {
+ if (pathp->objnode == obj->objnode_tree_root) {
+ while (obj->objnode_tree_height > 0 &&
+ obj->objnode_tree_root->slots_in_use == 1 &&
+ obj->objnode_tree_root->slots[0]) {
+ struct tmem_objnode *to_free =
+ obj->objnode_tree_root;
+
+ obj->objnode_tree_root =
+ to_free->slots[0];
+ obj->objnode_tree_height--;
+ to_free->slots[0] = NULL;
+ to_free->slots_in_use = 0;
+ tmem_objnode_free(to_free);
+ }
+ }
+ goto out;
+ }
+ tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
+ pathp--;
+ }
+ obj->objnode_tree_height = 0;
+ obj->objnode_tree_root = NULL;
+
+out:
+ if (slot != NULL)
+ obj->pampd_count--;
+ BUG_ON(obj->pampd_count < 0);
+ return slot;
+}
+
+/* recursively walk the objnode_tree destroying pampds and objnodes */
+static void tmem_objnode_node_destroy(struct tmem_obj *obj,
+ struct tmem_objnode *objnode,
+ unsigned int ht)
+{
+ int i;
+
+ if (ht == 0)
+ return;
+ for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
+ if (objnode->slots[i]) {
+ if (ht == 1) {
+ obj->pampd_count--;
+ (*tmem_pamops.free)(objnode->slots[i],
+ obj->pool, NULL, 0);
+ objnode->slots[i] = NULL;
+ continue;
+ }
+ tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
+ tmem_objnode_free(objnode->slots[i]);
+ objnode->slots[i] = NULL;
+ }
+ }
+}
+
+static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
+{
+ if (obj->objnode_tree_root == NULL)
+ return;
+ if (obj->objnode_tree_height == 0) {
+ obj->pampd_count--;
+ (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
+ } else {
+ tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
+ obj->objnode_tree_height);
+ tmem_objnode_free(obj->objnode_tree_root);
+ obj->objnode_tree_height = 0;
+ }
+ obj->objnode_tree_root = NULL;
+ (*tmem_pamops.free_obj)(obj->pool, obj);
+}
+
+/*
+ * Tmem is operated on by a set of well-defined actions:
+ * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
+ * (The tmem ABI allows for subpages and exchanges but these operations
+ * are not included in this implementation.)
+ *
+ * These "tmem core" operations are implemented in the following functions.
+ */
+
+/*
+ * "Put" a page, e.g. copy a page from the kernel into newly allocated
+ * PAM space (if such space is available). Tmem_put is complicated by
+ * a corner case: What if a page with matching handle already exists in
+ * tmem? To guarantee coherency, one of two actions is necessary: Either
+ * the data for the page must be overwritten, or the page must be
+ * "flushed" so that the data is not accessible to a subsequent "get".
+ * Since these "duplicate puts" are relatively rare, this implementation
+ * always flushes for simplicity.
+ */
+int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
+ char *data, size_t size, bool raw, bool ephemeral)
+{
+ struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
+ void *pampd = NULL, *pampd_del = NULL;
+ int ret = -ENOMEM;
+ struct tmem_hashbucket *hb;
+
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ obj = objfound = tmem_obj_find(hb, oidp);
+ if (obj != NULL) {
+ pampd = tmem_pampd_lookup_in_obj(objfound, index);
+ if (pampd != NULL) {
+ /* if found, is a dup put, flush the old one */
+ pampd_del = tmem_pampd_delete_from_obj(obj, index);
+ BUG_ON(pampd_del != pampd);
+ (*tmem_pamops.free)(pampd, pool, oidp, index);
+ if (obj->pampd_count == 0) {
+ objnew = obj;
+ objfound = NULL;
+ }
+ pampd = NULL;
+ }
+ } else {
+ obj = objnew = (*tmem_hostops.obj_alloc)(pool);
+ if (unlikely(obj == NULL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ tmem_obj_init(obj, hb, pool, oidp);
+ }
+ BUG_ON(obj == NULL);
+ BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
+ pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
+ obj->pool, &obj->oid, index);
+ if (unlikely(pampd == NULL))
+ goto free;
+ ret = tmem_pampd_add_to_obj(obj, index, pampd);
+ if (unlikely(ret == -ENOMEM))
+ /* may have partially built objnode tree ("stump") */
+ goto delete_and_free;
+ goto out;
+
+delete_and_free:
+ (void)tmem_pampd_delete_from_obj(obj, index);
+free:
+ if (pampd)
+ (*tmem_pamops.free)(pampd, pool, NULL, 0);
+ if (objnew) {
+ tmem_obj_free(objnew, hb);
+ (*tmem_hostops.obj_free)(objnew, pool);
+ }
+out:
+ spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * "Get" a page, e.g. if one can be found, copy the tmem page with the
+ * matching handle from PAM space to the kernel. By tmem definition,
+ * when a "get" is successful on an ephemeral page, the page is "flushed",
+ * and when a "get" is successful on a persistent page, the page is retained
+ * in tmem. Note that to preserve
+ * coherency, "get" can never be skipped if tmem contains the data.
+ * That is, if a get is done with a certain handle and fails, any
+ * subsequent "get" must also fail (unless of course there is a
+ * "put" done with the same handle).
+
+ */
+int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
+ char *data, size_t *size, bool raw, int get_and_free)
+{
+ struct tmem_obj *obj;
+ void *pampd;
+ bool ephemeral = is_ephemeral(pool);
+ int ret = -1;
+ struct tmem_hashbucket *hb;
+ bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
+ bool lock_held = false;
+
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ lock_held = true;
+ obj = tmem_obj_find(hb, oidp);
+ if (obj == NULL)
+ goto out;
+ if (free)
+ pampd = tmem_pampd_delete_from_obj(obj, index);
+ else
+ pampd = tmem_pampd_lookup_in_obj(obj, index);
+ if (pampd == NULL)
+ goto out;
+ if (free) {
+ if (obj->pampd_count == 0) {
+ tmem_obj_free(obj, hb);
+ (*tmem_hostops.obj_free)(obj, pool);
+ obj = NULL;
+ }
+ }
+ if (tmem_pamops.is_remote(pampd)) {
+ lock_held = false;
+ spin_unlock(&hb->lock);
+ }
+ if (free)
+ ret = (*tmem_pamops.get_data_and_free)(
+ data, size, raw, pampd, pool, oidp, index);
+ else
+ ret = (*tmem_pamops.get_data)(
+ data, size, raw, pampd, pool, oidp, index);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+out:
+ if (lock_held)
+ spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * If a page in tmem matches the handle, "flush" this page from tmem such
+ * that any subsequent "get" does not succeed (unless, of course, there
+ * was another "put" with the same handle).
+ */
+int tmem_flush_page(struct tmem_pool *pool,
+ struct tmem_oid *oidp, uint32_t index)
+{
+ struct tmem_obj *obj;
+ void *pampd;
+ int ret = -1;
+ struct tmem_hashbucket *hb;
+
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ obj = tmem_obj_find(hb, oidp);
+ if (obj == NULL)
+ goto out;
+ pampd = tmem_pampd_delete_from_obj(obj, index);
+ if (pampd == NULL)
+ goto out;
+ (*tmem_pamops.free)(pampd, pool, oidp, index);
+ if (obj->pampd_count == 0) {
+ tmem_obj_free(obj, hb);
+ (*tmem_hostops.obj_free)(obj, pool);
+ }
+ ret = 0;
+
+out:
+ spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * If a page in tmem matches the handle, replace the page so that any
+ * subsequent "get" gets the new page. Returns 0 if
+ * there was a page to replace, else returns -1.
+ */
+int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
+ uint32_t index, void *new_pampd)
+{
+ struct tmem_obj *obj;
+ int ret = -1;
+ struct tmem_hashbucket *hb;
+
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ obj = tmem_obj_find(hb, oidp);
+ if (obj == NULL)
+ goto out;
+ new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
+ ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
+out:
+ spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * "Flush" all pages in tmem matching this oid.
+ */
+int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
+{
+ struct tmem_obj *obj;
+ struct tmem_hashbucket *hb;
+ int ret = -1;
+
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ obj = tmem_obj_find(hb, oidp);
+ if (obj == NULL)
+ goto out;
+ tmem_pampd_destroy_all_in_obj(obj);
+ tmem_obj_free(obj, hb);
+ (*tmem_hostops.obj_free)(obj, pool);
+ ret = 0;
+
+out:
+ spin_unlock(&hb->lock);
+ return ret;
+}
+
+/*
+ * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
+ * all subsequent access to this tmem_pool.
+ */
+int tmem_destroy_pool(struct tmem_pool *pool)
+{
+ int ret = -1;
+
+ if (pool == NULL)
+ goto out;
+ tmem_pool_flush(pool, 1);
+ ret = 0;
+out:
+ return ret;
+}
+
+static LIST_HEAD(tmem_global_pool_list);
+
+/*
+ * Create a new tmem_pool with the provided flag and return
+ * a pool id provided by the tmem host implementation.
+ */
+void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
+{
+ int persistent = flags & TMEM_POOL_PERSIST;
+ int shared = flags & TMEM_POOL_SHARED;
+ struct tmem_hashbucket *hb = &pool->hashbucket[0];
+ int i;
+
+ for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
+ hb->obj_rb_root = RB_ROOT;
+ spin_lock_init(&hb->lock);
+ }
+ INIT_LIST_HEAD(&pool->pool_list);
+ atomic_set(&pool->obj_count, 0);
+ SET_SENTINEL(pool, POOL);
+ list_add_tail(&pool->pool_list, &tmem_global_pool_list);
+ pool->persistent = persistent;
+ pool->shared = shared;
+}
diff --git a/drivers/staging/ramster/tmem.h b/drivers/staging/ramster/tmem.h
new file mode 100644
index 000000000000..ed147c4b110d
--- /dev/null
+++ b/drivers/staging/ramster/tmem.h
@@ -0,0 +1,206 @@
+/*
+ * tmem.h
+ *
+ * Transcendent memory
+ *
+ * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ */
+
+#ifndef _TMEM_H_
+#define _TMEM_H_
+
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/hash.h>
+#include <linux/atomic.h>
+
+/*
+ * These are pre-defined by the Xen<->Linux ABI
+ */
+#define TMEM_PUT_PAGE 4
+#define TMEM_GET_PAGE 5
+#define TMEM_FLUSH_PAGE 6
+#define TMEM_FLUSH_OBJECT 7
+#define TMEM_POOL_PERSIST 1
+#define TMEM_POOL_SHARED 2
+#define TMEM_POOL_PRECOMPRESSED 4
+#define TMEM_POOL_PAGESIZE_SHIFT 4
+#define TMEM_POOL_PAGESIZE_MASK 0xf
+#define TMEM_POOL_RESERVED_BITS 0x00ffff00
+
+/*
+ * sentinels have proven very useful for debugging but can be removed
+ * or disabled before final merge.
+ */
+#define SENTINELS
+#ifdef SENTINELS
+#define DECL_SENTINEL uint32_t sentinel;
+#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL)
+#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL)
+#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL)
+#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL)
+#else
+#define DECL_SENTINEL
+#define SET_SENTINEL(_x, _y) do { } while (0)
+#define INVERT_SENTINEL(_x, _y) do { } while (0)
+#define ASSERT_SENTINEL(_x, _y) do { } while (0)
+#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
+#endif
+
+#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l))
+
+/*
+ * A pool is the highest-level data structure managed by tmem and
+ * usually corresponds to a large independent set of pages such as
+ * a filesystem. Each pool has an id, and certain attributes and counters.
+ * It also contains a set of hash buckets, each of which contains an rbtree
+ * of objects and a lock to manage concurrency within the pool.
+ */
+
+#define TMEM_HASH_BUCKET_BITS 8
+#define TMEM_HASH_BUCKETS (1<<TMEM_HASH_BUCKET_BITS)
+
+struct tmem_hashbucket {
+ struct rb_root obj_rb_root;
+ spinlock_t lock;
+};
+
+struct tmem_pool {
+ void *client; /* "up" for some clients, avoids table lookup */
+ struct list_head pool_list;
+ uint32_t pool_id;
+ bool persistent;
+ bool shared;
+ atomic_t obj_count;
+ atomic_t refcount;
+ struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS];
+ DECL_SENTINEL
+};
+
+#define is_persistent(_p) (_p->persistent)
+#define is_ephemeral(_p) (!(_p->persistent))
+
+/*
+ * An object id ("oid") is large: 192-bits (to ensure, for example, files
+ * in a modern filesystem can be uniquely identified).
+ */
+
+struct tmem_oid {
+ uint64_t oid[3];
+};
+
+static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
+{
+ oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
+}
+
+static inline bool tmem_oid_valid(struct tmem_oid *oidp)
+{
+ return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL ||
+ oidp->oid[2] != -1UL;
+}
+
+static inline int tmem_oid_compare(struct tmem_oid *left,
+ struct tmem_oid *right)
+{
+ int ret;
+
+ if (left->oid[2] == right->oid[2]) {
+ if (left->oid[1] == right->oid[1]) {
+ if (left->oid[0] == right->oid[0])
+ ret = 0;
+ else if (left->oid[0] < right->oid[0])
+ ret = -1;
+ else
+ return 1;
+ } else if (left->oid[1] < right->oid[1])
+ ret = -1;
+ else
+ ret = 1;
+ } else if (left->oid[2] < right->oid[2])
+ ret = -1;
+ else
+ ret = 1;
+ return ret;
+}
+
+static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
+{
+ return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
+ TMEM_HASH_BUCKET_BITS);
+}
+
+/*
+ * A tmem_obj contains an identifier (oid), pointers to the parent
+ * pool and the rb_tree to which it belongs, counters, and an ordered
+ * set of pampds, structured in a radix-tree-like tree. The intermediate
+ * nodes of the tree are called tmem_objnodes.
+ */
+
+struct tmem_objnode;
+
+struct tmem_obj {
+ struct tmem_oid oid;
+ struct tmem_pool *pool;
+ struct rb_node rb_tree_node;
+ struct tmem_objnode *objnode_tree_root;
+ unsigned int objnode_tree_height;
+ unsigned long objnode_count;
+ long pampd_count;
+ void *extra; /* for private use by pampd implementation */
+ DECL_SENTINEL
+};
+
+#define OBJNODE_TREE_MAP_SHIFT 6
+#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
+#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
+#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define OBJNODE_TREE_MAX_PATH \
+ (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
+
+struct tmem_objnode {
+ struct tmem_obj *obj;
+ DECL_SENTINEL
+ void *slots[OBJNODE_TREE_MAP_SIZE];
+ unsigned int slots_in_use;
+};
+
+/* pampd abstract datatype methods provided by the PAM implementation */
+struct tmem_pamops {
+ void *(*create)(char *, size_t, bool, int,
+ struct tmem_pool *, struct tmem_oid *, uint32_t);
+ int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *,
+ struct tmem_oid *, uint32_t);
+ int (*get_data_and_free)(char *, size_t *, bool, void *,
+ struct tmem_pool *, struct tmem_oid *,
+ uint32_t);
+ void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
+ void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
+ bool (*is_remote)(void *);
+ void (*new_obj)(struct tmem_obj *);
+ int (*replace_in_obj)(void *, struct tmem_obj *);
+};
+extern void tmem_register_pamops(struct tmem_pamops *m);
+
+/* memory allocation methods provided by the host implementation */
+struct tmem_hostops {
+ struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
+ void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
+ struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
+ void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
+};
+extern void tmem_register_hostops(struct tmem_hostops *m);
+
+/* core tmem accessor functions */
+extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
+ char *, size_t, bool, bool);
+extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
+ char *, size_t *, bool, int);
+extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
+ void *);
+extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
+ uint32_t index);
+extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
+extern int tmem_destroy_pool(struct tmem_pool *);
+extern void tmem_new_pool(struct tmem_pool *, uint32_t);
+#endif /* _TMEM_H */
diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c
new file mode 100644
index 000000000000..49c8791462f4
--- /dev/null
+++ b/drivers/staging/ramster/zcache-main.c
@@ -0,0 +1,1988 @@
+/*
+ * zcache.c
+ *
+ * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
+ * Copyright (c) 2010,2011, Nitin Gupta
+ *
+ * Zcache provides an in-kernel "host implementation" for transcendent memory
+ * and, thus indirectly, for cleancache and frontswap. Zcache includes two
+ * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
+ * 1) "compression buddies" ("zbud") is used for ephemeral pages
+ * 2) xvmalloc is used for persistent pages.
+ * Xvmalloc (based on the TLSF allocator) has very low fragmentation
+ * so maximizes space efficiency, while zbud allows pairs (and potentially,
+ * in the future, more than a pair of) compressed pages to be closely linked
+ * so that reclaiming can be done via the kernel's physical-page-oriented
+ * "shrinker" interface.
+ *
+ * [1] For a definition of page-accessible memory (aka PAM), see:
+ * http://marc.info/?l=linux-mm&m=127811271605009
+ */
+
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/list.h>
+#include <linux/lzo.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/math64.h>
+#include "tmem.h"
+
+#include "../zram/xvmalloc.h" /* if built in drivers/staging */
+
+#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
+#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
+#endif
+#ifdef CONFIG_CLEANCACHE
<