From e7fd41792fc0ee52a05fcaac87511f118328d147 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 18 Jan 2006 09:30:29 +0000 Subject: [DLM] The core of the DLM for GFS2/CLVM This is the core of the distributed lock manager which is required to use GFS2 as a cluster filesystem. It is also used by CLVM and can be used as a standalone lock manager independently of either of these two projects. It implements VAX-style locking modes. Signed-off-by: David Teigland Signed-off-by: Steve Whitehouse --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/dlm/Kconfig | 30 + fs/dlm/Makefile | 21 + fs/dlm/ast.c | 167 +++ fs/dlm/ast.h | 26 + fs/dlm/config.c | 787 +++++++++++ fs/dlm/config.h | 42 + fs/dlm/debug_fs.c | 310 +++++ fs/dlm/device.c | 1084 +++++++++++++++ fs/dlm/dir.c | 423 ++++++ fs/dlm/dir.h | 30 + fs/dlm/dlm_internal.h | 518 +++++++ fs/dlm/lock.c | 3610 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/dlm/lock.h | 50 + fs/dlm/lockspace.c | 666 +++++++++ fs/dlm/lockspace.h | 24 + fs/dlm/lowcomms.c | 1218 +++++++++++++++++ fs/dlm/lowcomms.h | 25 + fs/dlm/lvb_table.h | 18 + fs/dlm/main.c | 89 ++ fs/dlm/member.c | 314 +++++ fs/dlm/member.h | 24 + fs/dlm/memory.c | 122 ++ fs/dlm/memory.h | 31 + fs/dlm/midcomms.c | 140 ++ fs/dlm/midcomms.h | 21 + fs/dlm/rcom.c | 460 +++++++ fs/dlm/rcom.h | 24 + fs/dlm/recover.c | 762 +++++++++++ fs/dlm/recover.h | 34 + fs/dlm/recoverd.c | 285 ++++ fs/dlm/recoverd.h | 24 + fs/dlm/requestqueue.c | 184 +++ fs/dlm/requestqueue.h | 22 + fs/dlm/util.c | 173 +++ fs/dlm/util.h | 22 + 37 files changed, 11782 insertions(+) create mode 100644 fs/dlm/Kconfig create mode 100644 fs/dlm/Makefile create mode 100644 fs/dlm/ast.c create mode 100644 fs/dlm/ast.h create mode 100644 fs/dlm/config.c create mode 100644 fs/dlm/config.h create mode 100644 fs/dlm/debug_fs.c create mode 100644 fs/dlm/device.c create mode 100644 fs/dlm/dir.c create mode 100644 fs/dlm/dir.h create mode 100644 fs/dlm/dlm_internal.h create mode 100644 fs/dlm/lock.c create mode 100644 
fs/dlm/lock.h create mode 100644 fs/dlm/lockspace.c create mode 100644 fs/dlm/lockspace.h create mode 100644 fs/dlm/lowcomms.c create mode 100644 fs/dlm/lowcomms.h create mode 100644 fs/dlm/lvb_table.h create mode 100644 fs/dlm/main.c create mode 100644 fs/dlm/member.c create mode 100644 fs/dlm/member.h create mode 100644 fs/dlm/memory.c create mode 100644 fs/dlm/memory.h create mode 100644 fs/dlm/midcomms.c create mode 100644 fs/dlm/midcomms.h create mode 100644 fs/dlm/rcom.c create mode 100644 fs/dlm/rcom.h create mode 100644 fs/dlm/recover.c create mode 100644 fs/dlm/recover.h create mode 100644 fs/dlm/recoverd.c create mode 100644 fs/dlm/recoverd.h create mode 100644 fs/dlm/requestqueue.c create mode 100644 fs/dlm/requestqueue.h create mode 100644 fs/dlm/util.c create mode 100644 fs/dlm/util.h (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index bec8afa5704d..3a32f3f97e9b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1831,6 +1831,7 @@ source "fs/partitions/Kconfig" endmenu source "fs/nls/Kconfig" +source "fs/dlm/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index 0922727732c5..b298f4fdc6f2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_SYSFS) += sysfs/ obj-y += devpts/ obj-$(CONFIG_PROFILING) += dcookies.o +obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig new file mode 100644 index 000000000000..d01f735e6e06 --- /dev/null +++ b/fs/dlm/Kconfig @@ -0,0 +1,30 @@ +menu "Distributed Lock Manager" + depends on INET && EXPERIMENTAL + +config DLM + tristate "Distributed Lock Manager (DLM)" + depends on SYSFS + depends on IPV6 || IPV6=n + select IP_SCTP + select CONFIGFS_FS + help + A general purpose distributed lock manager for kernel or userspace + applications. 
+ +config DLM_DEVICE + tristate "DLM device for userspace access" + depends on DLM + help + This module creates a misc device through which the dlm lockspace + and locking functions become available to userspace applications + (usually through the libdlm library). + +config DLM_DEBUG + bool "DLM debugging" + depends on DLM + help + Under the debugfs mount point, the name of each lockspace will + appear as a file in the "dlm" directory. The output is the + list of resource and locks the local node knows about. + +endmenu diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile new file mode 100644 index 000000000000..1e6232e7d8e5 --- /dev/null +++ b/fs/dlm/Makefile @@ -0,0 +1,21 @@ +obj-$(CONFIG_DLM) += dlm.o +obj-$(CONFIG_DLM_DEVICE) += dlm_device.o + +dlm-y := ast.o \ + config.o \ + dir.o \ + lock.o \ + lockspace.o \ + lowcomms.o \ + main.o \ + member.o \ + memory.o \ + midcomms.o \ + rcom.o \ + recover.o \ + recoverd.o \ + requestqueue.o \ + util.o +dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o + +dlm_device-y := device.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c new file mode 100644 index 000000000000..2bd1c5e1a72c --- /dev/null +++ b/fs/dlm/ast.c @@ -0,0 +1,167 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#include "dlm_internal.h" +#include "lock.h" +#include "ast.h" + +#define WAKE_ASTS 0 + +static struct list_head ast_queue; +static spinlock_t ast_queue_lock; +static struct task_struct * astd_task; +static unsigned long astd_wakeflags; +static struct semaphore astd_running; + + +void dlm_del_ast(struct dlm_lkb *lkb) +{ + spin_lock(&ast_queue_lock); + if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) + list_del(&lkb->lkb_astqueue); + spin_unlock(&ast_queue_lock); +} + +void dlm_add_ast(struct dlm_lkb *lkb, int type) +{ + spin_lock(&ast_queue_lock); + if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_astqueue, &ast_queue); + } + lkb->lkb_ast_type |= type; + spin_unlock(&ast_queue_lock); + + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up_process(astd_task); +} + +static void process_asts(void) +{ + struct dlm_ls *ls = NULL; + struct dlm_rsb *r = NULL; + struct dlm_lkb *lkb; + void (*cast) (long param); + void (*bast) (long param, int mode); + int type = 0, found, bmode; + + for (;;) { + found = FALSE; + spin_lock(&ast_queue_lock); + list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { + r = lkb->lkb_resource; + ls = r->res_ls; + + if (dlm_locking_stopped(ls)) + continue; + + list_del(&lkb->lkb_astqueue); + type = lkb->lkb_ast_type; + lkb->lkb_ast_type = 0; + found = TRUE; + break; + } + spin_unlock(&ast_queue_lock); + + if (!found) + break; + + cast = lkb->lkb_astaddr; + bast = lkb->lkb_bastaddr; + bmode = lkb->lkb_bastmode; + + if ((type & AST_COMP) && cast) + cast(lkb->lkb_astparam); + + /* FIXME: Is it safe to look at lkb_grmode here + without doing a lock_rsb() ? + Look at other checks in v1 to avoid basts. 
*/ + + if ((type & AST_BAST) && bast) + if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) + bast(lkb->lkb_astparam, bmode); + + /* this removes the reference added by dlm_add_ast + and may result in the lkb being freed */ + dlm_put_lkb(lkb); + + schedule(); + } +} + +static inline int no_asts(void) +{ + int ret; + + spin_lock(&ast_queue_lock); + ret = list_empty(&ast_queue); + spin_unlock(&ast_queue_lock); + return ret; +} + +static int dlm_astd(void *data) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!test_bit(WAKE_ASTS, &astd_wakeflags)) + schedule(); + set_current_state(TASK_RUNNING); + + down(&astd_running); + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) + process_asts(); + up(&astd_running); + } + return 0; +} + +void dlm_astd_wake(void) +{ + if (!no_asts()) { + set_bit(WAKE_ASTS, &astd_wakeflags); + wake_up_process(astd_task); + } +} + +int dlm_astd_start(void) +{ + struct task_struct *p; + int error = 0; + + INIT_LIST_HEAD(&ast_queue); + spin_lock_init(&ast_queue_lock); + init_MUTEX(&astd_running); + + p = kthread_run(dlm_astd, NULL, "dlm_astd"); + if (IS_ERR(p)) + error = PTR_ERR(p); + else + astd_task = p; + return error; +} + +void dlm_astd_stop(void) +{ + kthread_stop(astd_task); +} + +void dlm_astd_suspend(void) +{ + down(&astd_running); +} + +void dlm_astd_resume(void) +{ + up(&astd_running); +} + diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h new file mode 100644 index 000000000000..6ee276c74c52 --- /dev/null +++ b/fs/dlm/ast.h @@ -0,0 +1,26 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __ASTD_DOT_H__ +#define __ASTD_DOT_H__ + +void dlm_add_ast(struct dlm_lkb *lkb, int type); +void dlm_del_ast(struct dlm_lkb *lkb); + +void dlm_astd_wake(void); +int dlm_astd_start(void); +void dlm_astd_stop(void); +void dlm_astd_suspend(void); +void dlm_astd_resume(void); + +#endif + diff --git a/fs/dlm/config.c b/fs/dlm/config.c new file mode 100644 index 000000000000..024ace9973a8 --- /dev/null +++ b/fs/dlm/config.c @@ -0,0 +1,787 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +#include +#include +#include +#include + +#include "config.h" + +/* + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight + * /config/dlm/<cluster>/comms/<comm>/nodeid + * /config/dlm/<cluster>/comms/<comm>/local + * /config/dlm/<cluster>/comms/<comm>/addr + * The <cluster> level is useless, but I haven't figured out how to avoid it. 
+ */ + +static struct config_group *space_list; +static struct config_group *comm_list; +static struct comm *local_comm; + +struct clusters; +struct cluster; +struct spaces; +struct space; +struct comms; +struct comm; +struct nodes; +struct node; + +static struct config_group *make_cluster(struct config_group *, const char *); +static void drop_cluster(struct config_group *, struct config_item *); +static void release_cluster(struct config_item *); +static struct config_group *make_space(struct config_group *, const char *); +static void drop_space(struct config_group *, struct config_item *); +static void release_space(struct config_item *); +static struct config_item *make_comm(struct config_group *, const char *); +static void drop_comm(struct config_group *, struct config_item *); +static void release_comm(struct config_item *); +static struct config_item *make_node(struct config_group *, const char *); +static void drop_node(struct config_group *, struct config_item *); +static void release_node(struct config_item *); + +static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf); +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len); + +static ssize_t comm_nodeid_read(struct comm *cm, char *buf); +static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len); +static ssize_t comm_local_read(struct comm *cm, char *buf); +static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len); +static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len); +static ssize_t node_nodeid_read(struct node *nd, char *buf); +static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); +static ssize_t node_weight_read(struct 
node *nd, char *buf); +static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); + +enum { + COMM_ATTR_NODEID = 0, + COMM_ATTR_LOCAL, + COMM_ATTR_ADDR, +}; + +struct comm_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct comm *, char *); + ssize_t (*store)(struct comm *, const char *, size_t); +}; + +static struct comm_attribute comm_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_nodeid_read, + .store = comm_nodeid_write, +}; + +static struct comm_attribute comm_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = comm_local_read, + .store = comm_local_write, +}; + +static struct comm_attribute comm_attr_addr = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "addr", + .ca_mode = S_IRUGO | S_IWUSR }, + .store = comm_addr_write, +}; + +static struct configfs_attribute *comm_attrs[] = { + [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, + [COMM_ATTR_LOCAL] = &comm_attr_local.attr, + [COMM_ATTR_ADDR] = &comm_attr_addr.attr, + NULL, +}; + +enum { + NODE_ATTR_NODEID = 0, + NODE_ATTR_WEIGHT, +}; + +struct node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct node *, char *); + ssize_t (*store)(struct node *, const char *, size_t); +}; + +static struct node_attribute node_attr_nodeid = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "nodeid", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_nodeid_read, + .store = node_nodeid_write, +}; + +static struct node_attribute node_attr_weight = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "weight", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = node_weight_read, + .store = node_weight_write, +}; + +static struct configfs_attribute *node_attrs[] = { + [NODE_ATTR_NODEID] = &node_attr_nodeid.attr, + [NODE_ATTR_WEIGHT] = &node_attr_weight.attr, + NULL, +}; + +struct clusters { + struct configfs_subsystem subsys; +}; + 
+struct cluster { + struct config_group group; +}; + +struct spaces { + struct config_group ss_group; +}; + +struct space { + struct config_group group; + struct list_head members; + struct semaphore members_lock; + int members_count; +}; + +struct comms { + struct config_group cs_group; +}; + +struct comm { + struct config_item item; + int nodeid; + int local; + int addr_count; + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; +}; + +struct nodes { + struct config_group ns_group; +}; + +struct node { + struct config_item item; + struct list_head list; /* space->members */ + int nodeid; + int weight; +}; + +static struct configfs_group_operations clusters_ops = { + .make_group = make_cluster, + .drop_item = drop_cluster, +}; + +static struct configfs_item_operations cluster_ops = { + .release = release_cluster, +}; + +static struct configfs_group_operations spaces_ops = { + .make_group = make_space, + .drop_item = drop_space, +}; + +static struct configfs_item_operations space_ops = { + .release = release_space, +}; + +static struct configfs_group_operations comms_ops = { + .make_item = make_comm, + .drop_item = drop_comm, +}; + +static struct configfs_item_operations comm_ops = { + .release = release_comm, + .show_attribute = show_comm, + .store_attribute = store_comm, +}; + +static struct configfs_group_operations nodes_ops = { + .make_item = make_node, + .drop_item = drop_node, +}; + +static struct configfs_item_operations node_ops = { + .release = release_node, + .show_attribute = show_node, + .store_attribute = store_node, +}; + +static struct config_item_type clusters_type = { + .ct_group_ops = &clusters_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type cluster_type = { + .ct_item_ops = &cluster_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type spaces_type = { + .ct_group_ops = &spaces_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type space_type = { + .ct_item_ops = &space_ops, + .ct_owner = 
THIS_MODULE, +}; + +static struct config_item_type comms_type = { + .ct_group_ops = &comms_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type comm_type = { + .ct_item_ops = &comm_ops, + .ct_attrs = comm_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type nodes_type = { + .ct_group_ops = &nodes_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type node_type = { + .ct_item_ops = &node_ops, + .ct_attrs = node_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct cluster *to_cluster(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct cluster, group):NULL; +} + +static struct space *to_space(struct config_item *i) +{ + return i ? container_of(to_config_group(i), struct space, group) : NULL; +} + +static struct comm *to_comm(struct config_item *i) +{ + return i ? container_of(i, struct comm, item) : NULL; +} + +static struct node *to_node(struct config_item *i) +{ + return i ? container_of(i, struct node, item) : NULL; +} + +static struct config_group *make_cluster(struct config_group *g, + const char *name) +{ + struct cluster *cl = NULL; + struct spaces *sps = NULL; + struct comms *cms = NULL; + void *gps = NULL; + + cl = kzalloc(sizeof(struct cluster), GFP_KERNEL); + gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); + sps = kzalloc(sizeof(struct spaces), GFP_KERNEL); + cms = kzalloc(sizeof(struct comms), GFP_KERNEL); + + if (!cl || !gps || !sps || !cms) + goto fail; + + config_group_init_type_name(&cl->group, name, &cluster_type); + config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); + config_group_init_type_name(&cms->cs_group, "comms", &comms_type); + + cl->group.default_groups = gps; + cl->group.default_groups[0] = &sps->ss_group; + cl->group.default_groups[1] = &cms->cs_group; + cl->group.default_groups[2] = NULL; + + space_list = &sps->ss_group; + comm_list = &cms->cs_group; + return &cl->group; + + fail: + kfree(cl); + kfree(gps); + kfree(sps); + 
kfree(cms); + return NULL; +} + +static void drop_cluster(struct config_group *g, struct config_item *i) +{ + struct cluster *cl = to_cluster(i); + struct config_item *tmp; + int j; + + for (j = 0; cl->group.default_groups[j]; j++) { + tmp = &cl->group.default_groups[j]->cg_item; + cl->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + space_list = NULL; + comm_list = NULL; + + config_item_put(i); +} + +static void release_cluster(struct config_item *i) +{ + struct cluster *cl = to_cluster(i); + kfree(cl->group.default_groups); + kfree(cl); +} + +static struct config_group *make_space(struct config_group *g, const char *name) +{ + struct space *sp = NULL; + struct nodes *nds = NULL; + void *gps = NULL; + + sp = kzalloc(sizeof(struct space), GFP_KERNEL); + gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); + nds = kzalloc(sizeof(struct nodes), GFP_KERNEL); + + if (!sp || !gps || !nds) + goto fail; + + config_group_init_type_name(&sp->group, name, &space_type); + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); + + sp->group.default_groups = gps; + sp->group.default_groups[0] = &nds->ns_group; + sp->group.default_groups[1] = NULL; + + INIT_LIST_HEAD(&sp->members); + init_MUTEX(&sp->members_lock); + sp->members_count = 0; + return &sp->group; + + fail: + kfree(sp); + kfree(gps); + kfree(nds); + return NULL; +} + +static void drop_space(struct config_group *g, struct config_item *i) +{ + struct space *sp = to_space(i); + struct config_item *tmp; + int j; + + /* assert list_empty(&sp->members) */ + + for (j = 0; sp->group.default_groups[j]; j++) { + tmp = &sp->group.default_groups[j]->cg_item; + sp->group.default_groups[j] = NULL; + config_item_put(tmp); + } + + config_item_put(i); +} + +static void release_space(struct config_item *i) +{ + struct space *sp = to_space(i); + kfree(sp->group.default_groups); + kfree(sp); +} + +static struct config_item *make_comm(struct config_group *g, const char *name) +{ + struct comm *cm; + + 
cm = kzalloc(sizeof(struct comm), GFP_KERNEL); + if (!cm) + return NULL; + + config_item_init_type_name(&cm->item, name, &comm_type); + cm->nodeid = -1; + cm->local = 0; + cm->addr_count = 0; + return &cm->item; +} + +static void drop_comm(struct config_group *g, struct config_item *i) +{ + struct comm *cm = to_comm(i); + if (local_comm == cm) + local_comm = NULL; + while (cm->addr_count--) + kfree(cm->addr[cm->addr_count]); + config_item_put(i); +} + +static void release_comm(struct config_item *i) +{ + struct comm *cm = to_comm(i); + kfree(cm); +} + +static struct config_item *make_node(struct config_group *g, const char *name) +{ + struct space *sp = to_space(g->cg_item.ci_parent); + struct node *nd; + + nd = kzalloc(sizeof(struct node), GFP_KERNEL); + if (!nd) + return NULL; + + config_item_init_type_name(&nd->item, name, &node_type); + nd->nodeid = -1; + nd->weight = 1; /* default weight of 1 if none is set */ + + down(&sp->members_lock); + list_add(&nd->list, &sp->members); + sp->members_count++; + up(&sp->members_lock); + + return &nd->item; +} + +static void drop_node(struct config_group *g, struct config_item *i) +{ + struct space *sp = to_space(g->cg_item.ci_parent); + struct node *nd = to_node(i); + + down(&sp->members_lock); + list_del(&nd->list); + sp->members_count--; + up(&sp->members_lock); + + config_item_put(i); +} + +static void release_node(struct config_item *i) +{ + struct node *nd = to_node(i); + kfree(nd); +} + +static struct clusters clusters_root = { + .subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "dlm", + .ci_type = &clusters_type, + }, + }, + }, +}; + +int dlm_config_init(void) +{ + config_group_init(&clusters_root.subsys.su_group); + init_MUTEX(&clusters_root.subsys.su_sem); + return configfs_register_subsystem(&clusters_root.subsys); +} + +void dlm_config_exit(void) +{ + configfs_unregister_subsystem(&clusters_root.subsys); +} + +/* + * Functions for user space to read/write attributes + */ + +static ssize_t 
show_comm(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct comm *cm = to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->show ? cma->show(cm, buf) : 0; +} + +static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct comm *cm = to_comm(i); + struct comm_attribute *cma = + container_of(a, struct comm_attribute, attr); + return cma->store ? cma->store(cm, buf, len) : -EINVAL; +} + +static ssize_t comm_nodeid_read(struct comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->nodeid); +} + +static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len) +{ + cm->nodeid = simple_strtol(buf, NULL, 0); + return len; +} + +static ssize_t comm_local_read(struct comm *cm, char *buf) +{ + return sprintf(buf, "%d\n", cm->local); +} + +static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len) +{ + cm->local= simple_strtol(buf, NULL, 0); + if (cm->local && !local_comm) + local_comm = cm; + return len; +} + +static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len) +{ + struct sockaddr_storage *addr; + + if (len != sizeof(struct sockaddr_storage)) + return -EINVAL; + + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) + return -ENOSPC; + + addr = kzalloc(sizeof(*addr), GFP_KERNEL); + if (!addr) + return -ENOMEM; + + memcpy(addr, buf, len); + cm->addr[cm->addr_count++] = addr; + return len; +} + +static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, + char *buf) +{ + struct node *nd = to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->show ? nda->show(nd, buf) : 0; +} + +static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, + const char *buf, size_t len) +{ + struct node *nd = to_node(i); + struct node_attribute *nda = + container_of(a, struct node_attribute, attr); + return nda->store ? 
nda->store(nd, buf, len) : -EINVAL; +} + +static ssize_t node_nodeid_read(struct node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->nodeid); +} + +static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len) +{ + nd->nodeid = simple_strtol(buf, NULL, 0); + return len; +} + +static ssize_t node_weight_read(struct node *nd, char *buf) +{ + return sprintf(buf, "%d\n", nd->weight); +} + +static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) +{ + nd->weight = simple_strtol(buf, NULL, 0); + return len; +} + +/* + * Functions for the dlm to get the info that's been configured + */ + +static struct space *get_space(char *name) +{ + if (!space_list) + return NULL; + return to_space(config_group_find_obj(space_list, name)); +} + +static void put_space(struct space *sp) +{ + config_item_put(&sp->group.cg_item); +} + +static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) +{ + struct config_item *i; + struct comm *cm = NULL; + int found = 0; + + if (!comm_list) + return NULL; + + down(&clusters_root.subsys.su_sem); + + list_for_each_entry(i, &comm_list->cg_children, ci_entry) { + cm = to_comm(i); + + if (nodeid) { + if (cm->nodeid != nodeid) + continue; + found = 1; + break; + } else { + if (!cm->addr_count || + memcmp(cm->addr[0], addr, sizeof(*addr))) + continue; + found = 1; + break; + } + } + up(&clusters_root.subsys.su_sem); + + if (found) + config_item_get(i); + else + cm = NULL; + return cm; +} + +static void put_comm(struct comm *cm) +{ + config_item_put(&cm->item); +} + +/* caller must free mem */ +int dlm_nodeid_list(char *lsname, int **ids_out) +{ + struct space *sp; + struct node *nd; + int i = 0, rv = 0; + int *ids; + + sp = get_space(lsname); + if (!sp) + return -EEXIST; + + down(&sp->members_lock); + if (!sp->members_count) { + rv = 0; + goto out; + } + + ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL); + if (!ids) { + rv = -ENOMEM; + goto out; + } + + rv = sp->members_count; + 
list_for_each_entry(nd, &sp->members, list) + ids[i++] = nd->nodeid; + + if (rv != i) + printk("bad nodeid count %d %d\n", rv, i); + + *ids_out = ids; + out: + up(&sp->members_lock); + put_space(sp); + return rv; +} + +int dlm_node_weight(char *lsname, int nodeid) +{ + struct space *sp; + struct node *nd; + int w = -EEXIST; + + sp = get_space(lsname); + if (!sp) + goto out; + + down(&sp->members_lock); + list_for_each_entry(nd, &sp->members, list) { + if (nd->nodeid != nodeid) + continue; + w = nd->weight; + break; + } + up(&sp->members_lock); + put_space(sp); + out: + return w; +} + +int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) +{ + struct comm *cm = get_comm(nodeid, NULL); + if (!cm) + return -EEXIST; + if (!cm->addr_count) + return -ENOENT; + memcpy(addr, cm->addr[0], sizeof(*addr)); + put_comm(cm); + return 0; +} + +int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +{ + struct comm *cm = get_comm(0, addr); + if (!cm) + return -EEXIST; + *nodeid = cm->nodeid; + put_comm(cm); + return 0; +} + +int dlm_our_nodeid(void) +{ + return local_comm ? 
local_comm->nodeid : 0; +} + +/* num 0 is first addr, num 1 is second addr */ +int dlm_our_addr(struct sockaddr_storage *addr, int num) +{ + if (!local_comm) + return -1; + if (num + 1 > local_comm->addr_count) + return -1; + memcpy(addr, local_comm->addr[num], sizeof(*addr)); + return 0; +} + +/* Config file defaults */ +#define DEFAULT_TCP_PORT 21064 +#define DEFAULT_BUFFER_SIZE 4096 +#define DEFAULT_RSBTBL_SIZE 256 +#define DEFAULT_LKBTBL_SIZE 1024 +#define DEFAULT_DIRTBL_SIZE 512 +#define DEFAULT_RECOVER_TIMER 5 +#define DEFAULT_TOSS_SECS 10 +#define DEFAULT_SCAN_SECS 5 + +struct dlm_config_info dlm_config = { + .tcp_port = DEFAULT_TCP_PORT, + .buffer_size = DEFAULT_BUFFER_SIZE, + .rsbtbl_size = DEFAULT_RSBTBL_SIZE, + .lkbtbl_size = DEFAULT_LKBTBL_SIZE, + .dirtbl_size = DEFAULT_DIRTBL_SIZE, + .recover_timer = DEFAULT_RECOVER_TIMER, + .toss_secs = DEFAULT_TOSS_SECS, + .scan_secs = DEFAULT_SCAN_SECS +}; + diff --git a/fs/dlm/config.h b/fs/dlm/config.h new file mode 100644 index 000000000000..9da7839958a9 --- /dev/null +++ b/fs/dlm/config.h @@ -0,0 +1,42 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+** +******************************************************************************* +******************************************************************************/ + +#ifndef __CONFIG_DOT_H__ +#define __CONFIG_DOT_H__ + +#define DLM_MAX_ADDR_COUNT 3 + +struct dlm_config_info { + int tcp_port; + int buffer_size; + int rsbtbl_size; + int lkbtbl_size; + int dirtbl_size; + int recover_timer; + int toss_secs; + int scan_secs; +}; + +extern struct dlm_config_info dlm_config; + +int dlm_config_init(void); +void dlm_config_exit(void); +int dlm_node_weight(char *lsname, int nodeid); +int dlm_nodeid_list(char *lsname, int **ids_out); +int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); +int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); +int dlm_our_nodeid(void); +int dlm_our_addr(struct sockaddr_storage *addr, int num); + +#endif /* __CONFIG_DOT_H__ */ + diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c new file mode 100644 index 000000000000..98b49a1ece47 --- /dev/null +++ b/fs/dlm/debug_fs.c @@ -0,0 +1,310 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. 
+**
+*******************************************************************************
+******************************************************************************/
+
+#include
+#include
+#include
+#include
+#include
+
+#include "dlm_internal.h"
+
+
+static struct dentry *dlm_root;
+
+/* Cursor used by the seq_file callbacks to walk every rsb of a lockspace */
+struct rsb_iter {
+	int entry;		/* hash bucket currently being walked */
+	struct dlm_ls *ls;
+	struct list_head *next;	/* current position in that bucket's chain */
+	struct dlm_rsb *rsb;	/* rsb at the current position */
+};
+
+/* Map a DLM lock mode to its two-letter name ("??" for unknown values) */
+static char *print_lockmode(int mode)
+{
+	switch (mode) {
+	case DLM_LOCK_IV:
+		return "--";
+	case DLM_LOCK_NL:
+		return "NL";
+	case DLM_LOCK_CR:
+		return "CR";
+	case DLM_LOCK_CW:
+		return "CW";
+	case DLM_LOCK_PR:
+		return "PR";
+	case DLM_LOCK_PW:
+		return "PW";
+	case DLM_LOCK_EX:
+		return "EX";
+	default:
+		return "??";
+	}
+}
+
+/* Emit one line describing a lock: id, granted mode, requested mode when
+   converting/waiting, ranges, remote/master ids and wait type. */
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+		       struct dlm_rsb *res)
+{
+	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT
+	    || lkb->lkb_status == DLM_LKSTS_WAITING)
+		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+	if (lkb->lkb_range) {
+		/* FIXME: this warns on Alpha */
+		if (lkb->lkb_status == DLM_LKSTS_CONVERT
+		    || lkb->lkb_status == DLM_LKSTS_GRANTED)
+			seq_printf(s, " %" PRIx64 "-%" PRIx64,
+				   lkb->lkb_range[GR_RANGE_START],
+				   lkb->lkb_range[GR_RANGE_END]);
+		if (lkb->lkb_status == DLM_LKSTS_CONVERT
+		    || lkb->lkb_status == DLM_LKSTS_WAITING)
+			seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
+				   lkb->lkb_range[RQ_RANGE_START],
+				   lkb->lkb_range[RQ_RANGE_END]);
+	}
+
+	if (lkb->lkb_nodeid) {
+		if (lkb->lkb_nodeid != res->res_nodeid)
+			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+				   lkb->lkb_remid);
+		else
+			seq_printf(s, " Master: %08x", lkb->lkb_remid);
+	}
+
+	if (lkb->lkb_wait_type)
+		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+	seq_printf(s, "\n");
+}
+
+/* Dump one resource: its name (non-printable bytes shown as '.'), where
+   its master is, its LVB if present, and its three lock queues.
+   Always returns 0. */
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = res->res_ls->ls_lvblen;
+
+	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+	for (i = 0; i < res->res_length; i++) {
+		if (isprint(res->res_name[i]))
+			seq_printf(s, "%c", res->res_name[i]);
+		else
+			seq_printf(s, "%c", '.');
+	}
+	if (res->res_nodeid > 0)
+		seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
+			   res->res_nodeid);
+	else if (res->res_nodeid == 0)
+		seq_printf(s, "\" \nMaster Copy\n");
+	else if (res->res_nodeid == -1)
+		seq_printf(s, "\" \nLooking up master (lkid %x)\n",
+			   res->res_first_lkid);
+	else
+		seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
+
+	/* Print the LVB: */
+	if (res->res_lvbptr) {
+		seq_printf(s, "LVB: ");
+		for (i = 0; i < lvblen; i++) {
+			if (i == lvblen / 2)
+				seq_printf(s, "\n ");
+			seq_printf(s, "%02x ",
+				   (unsigned char) res->res_lvbptr[i]);
+		}
+		if (rsb_flag(res, RSB_VALNOTVALID))
+			seq_printf(s, " (INVALID)");
+		seq_printf(s, "\n");
+	}
+
+	/* Print the locks attached to this resource */
+	seq_printf(s, "Granted Queue\n");
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Conversion Queue\n");
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Waiting Queue\n");
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	return 0;
+}
+
+/* Advance the cursor to the next rsb.  Returns 0 with ri->rsb set, or 1
+   when every bucket has been walked.
+   NOTE(review): the bucket lock is dropped between calls while ri->next
+   still points into the chain; whether an rsb can be unlinked in that
+   window is not visible from this file -- confirm. */
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+	struct dlm_ls *ls = ri->ls;
+	int i;
+
+	if (!ri->next) {
+ top:
+		/* Find the next non-empty hash bucket */
+		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+			read_lock(&ls->ls_rsbtbl[i].lock);
+			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+				ri->next = ls->ls_rsbtbl[i].list.next;
+				read_unlock(&ls->ls_rsbtbl[i].lock);
+				break;
+			}
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+		}
+		ri->entry = i;
+
+		if (ri->entry >= ls->ls_rsbtbl_size)
+			return 1;
+	} else {
+		i = ri->entry;
+		read_lock(&ls->ls_rsbtbl[i].lock);
+		ri->next = ri->next->next;
+		/* True exactly when ri->next has wrapped round to the
+		   list head of this bucket */
+		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+			/* End of list - move to next bucket */
+			ri->next = NULL;
+			ri->entry++;
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+			goto top;
+		}
+		read_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+	ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+	return 0;
+}
+
+/* Release a cursor; NULL is accepted (kfree ignores it) */
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+	kfree(ri);
+}
+
+/* Allocate a cursor positioned on the first rsb of @ls.  Returns NULL on
+   allocation failure or when the lockspace has no rsbs. */
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+	struct rsb_iter *ri;
+
+	ri = kmalloc(sizeof *ri, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	ri->ls = ls;
+	ri->entry = 0;
+	ri->next = NULL;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+/* seq_file start: build a fresh cursor and step it forward *pos entries.
+   Returns NULL when *pos is past the last rsb. */
+static void *seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *ri;
+	loff_t n = *pos;
+
+	ri = rsb_iter_init(file->private);
+	if (!ri)
+		return NULL;
+
+	while (n--) {
+		if (rsb_iter_next(ri)) {
+			rsb_iter_free(ri);
+			return NULL;
+		}
+	}
+
+	return ri;
+}
+
+/* seq_file next: advance the cursor; frees it and returns NULL at the end */
+static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	(*pos)++;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+/* seq_file stop: called after every start/next sequence with the last
+   iterator value returned.  When the pass ended early (output buffer
+   full) that value is still a live cursor which seq_start will not
+   reuse, so it has to be freed here or it leaks.  When seq_start or
+   seq_next already freed the cursor they returned NULL, and freeing
+   NULL is a no-op. */
+static void seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	rsb_iter_free(iter_ptr);
+}
+
+/* seq_file show: print the rsb the cursor currently points at */
+static int seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	print_resource(ri->rsb, file);
+
+	return 0;
+}
+
+static struct seq_operations dlm_seq_ops = {
+	.start = seq_start,
+	.next = seq_next,
+	.stop = seq_stop,
+	.show = seq_show,
+};
+
+/* Open the debug file: attach the seq_file ops and hand the lockspace
+   stored in the inode to the iterator via seq->private. */
+static int do_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &dlm_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->u.generic_ip;
+
+	return 0;
+}
+
+static struct file_operations dlm_fops = {
+	.owner = THIS_MODULE,
+	.open = do_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+
+/* Create the per-lockspace debug file, named after the lockspace, under
+   the "dlm" directory.  Returns 0 or -ENOMEM. */
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+	ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
+						  S_IFREG | S_IRUGO,
+						  dlm_root,
+						  ls,
+						  &dlm_fops);
+	return ls->ls_debug_dentry ? 0 : -ENOMEM;
+}
+
+/* Remove the per-lockspace debug file if it was created */
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	if (ls->ls_debug_dentry)
+		debugfs_remove(ls->ls_debug_dentry);
+}
+
+/* Create the top-level "dlm" debugfs directory.  Returns 0 or -ENOMEM. */
+int dlm_register_debugfs(void)
+{
+	dlm_root = debugfs_create_dir("dlm", NULL);
+	return dlm_root ? 0 : -ENOMEM;
+}
+
+/* Remove the top-level "dlm" debugfs directory */
+void dlm_unregister_debugfs(void)
+{
+	debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/device.c b/fs/dlm/device.c
new file mode 100644
index 000000000000..a8bf600ed13d
--- /dev/null
+++ b/fs/dlm/device.c
@@ -0,0 +1,1084 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * device.c
+ *
+ * This is the userland interface to the DLM.
+ *
+ * The locking is done via a misc char device (find the
+ * registered minor number in /proc/misc).
+ *
+ * User code should not use this interface directly but
+ * call the library routines in libdlm.a instead.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "lvb_table.h"
+
+static struct file_operations _dlm_fops;
+static const char *name_prefix="dlm";
+/* List of every registered user_ls, protected by user_ls_lock */
+static struct list_head user_ls_list;
+static struct semaphore user_ls_lock;
+
+/* Lock infos are stored in here indexed by lock ID */
+static DEFINE_IDR(lockinfo_idr);
+static rwlock_t lockinfo_lock;
+
+/* Flags in li_flags.  These are bit numbers (used with set_bit/test_bit),
+   not masks. */
+#define LI_FLAG_COMPLETE 1
+#define LI_FLAG_FIRSTLOCK 2
+#define LI_FLAG_PERSISTENT 3
+
+/* Flags in ls_flags; bit numbers as well */
+#define LS_FLAG_DELETED 1
+#define LS_FLAG_AUTOFREE 2
+
+
+/* Stored in li_magic and checked before a lock_info looked up by ID is used */
+#define LOCKINFO_MAGIC 0x53595324
+
+/* Kernel-side record of one userland lock */
+struct lock_info {
+	uint32_t li_magic;
+	uint8_t li_cmd;
+	int8_t li_grmode;	/* granted mode, -1 until a request succeeds */
+	int8_t li_rqmode;	/* mode of the request in flight */
+	struct dlm_lksb li_lksb;
+	wait_queue_head_t li_waitq;	/* woken for requests with no castaddr */
+	unsigned long li_flags;
+	void __user *li_castparam;
+	void __user *li_castaddr;
+	void __user *li_bastparam;
+	void __user *li_bastaddr;
+	/* blocking-AST args of a pending conversion; made active by
+	   ast_routine when the conversion succeeds */
+	void __user *li_pend_bastparam;
+	void __user *li_pend_bastaddr;
+	struct list_head li_ownerqueue;	/* link in file_info.fi_li_list */
+	struct file_info *li_file;
+	struct dlm_lksb __user *li_user_lksb;
+	/* created locked by do_user_lock; ast_routine waits on it before
+	   freeing the lock_info of a failed new lock */
+	struct semaphore li_firstlock;
+};
+
+/* A queued AST no less */
+struct ast_info {
+	struct dlm_lock_result result;
+	struct list_head list;
+	uint32_t lvb_updated;
+	uint32_t progress; /* How much has been read */
+};
+
+/* One of these per userland lockspace */
+struct user_ls {
+	void *ls_lockspace;	/* set to NULL once the lockspace is unregistered */
+	atomic_t ls_refcnt;	/* incremented per open, dropped on close */
+	long ls_flags;
+
+	/* Passed into misc_register() */
+	struct miscdevice ls_miscinfo;
+	struct list_head ls_list;
+};
+
+/* misc_device info for the control device */
+static struct miscdevice ctl_device;
+
+/*
+ * Stuff we hang off the file struct.
+ * The first two are to cope with unlocking all the
+ * locks held by a process when it dies.
+ */
+struct file_info {
+	struct list_head fi_li_list;  /* List of active lock_infos */
+	spinlock_t fi_li_lock;
+	struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
+	spinlock_t fi_ast_lock;
+	wait_queue_head_t fi_wait;
+	struct user_ls *fi_ls;
+	atomic_t fi_refcnt;           /* Number of users */
+	unsigned long fi_flags;       /* Bit 1 means the device is open */
+};
+
+
+/* get and put ops for file_info.
+   Actually I don't really like "get" and "put", but everyone
+   else seems to use them and I can't think of anything
+   nicer at the moment */
+static void get_file_info(struct file_info *f)
+{
+	atomic_inc(&f->fi_refcnt);
+}
+
+/* Drop a reference; the file_info is freed when the last one goes */
+static void put_file_info(struct file_info *f)
+{
+	if (atomic_dec_and_test(&f->fi_refcnt))
+		kfree(f);
+}
+
+/* Undo allocate_lockinfo(): drop the file_info reference, remove the
+   lock ID from the idr, free the LVB and the lock_info, and release the
+   module reference. */
+static void release_lockinfo(struct lock_info *li)
+{
+	put_file_info(li->li_file);
+
+	write_lock(&lockinfo_lock);
+	idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
+	write_unlock(&lockinfo_lock);
+
+	/* kfree() accepts NULL, no guard needed */
+	kfree(li->li_lksb.sb_lvbptr);
+	kfree(li);
+
+	module_put(THIS_MODULE);
+}
+
+/* Look up a lock_info by lock ID; returns NULL when the ID is unknown */
+static struct lock_info *get_lockinfo(uint32_t lockid)
+{
+	struct lock_info *li;
+
+	read_lock(&lockinfo_lock);
+	li = idr_find(&lockinfo_idr, lockid);
+	read_unlock(&lockinfo_lock);
+
+	return li;
+}
+
+/* Register @li in the idr under its lock ID.
+   Returns 0, -EINVAL if the ID is already registered, or -ENOMEM if the
+   idr could not store it under exactly that ID. */
+static int add_lockinfo(struct lock_info *li)
+{
+	int n;
+	int r;
+	int ret = -ENOMEM;
+
+	/* idr_pre_get() allocates with GFP_KERNEL and may sleep, so it
+	   has to run before the rwlock is taken, not under it */
+	if (!idr_pre_get(&lockinfo_idr, GFP_KERNEL))
+		return ret;
+
+	write_lock(&lockinfo_lock);
+
+	if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid)) {
+		ret = -EINVAL;
+		goto out_up;
+	}
+
+	r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
+	if (r)
+		goto out_up;
+
+	/* "above" may hand back a larger ID; only the exact one is usable */
+	if (n != li->li_lksb.sb_lkid) {
+		idr_remove(&lockinfo_idr, n);
+		goto out_up;
+	}
+
+	ret = 0;
+
+ out_up:
+	write_unlock(&lockinfo_lock);
+
+	return ret;
+}
+
+
+/* Find a lockspace struct by minor number; caller holds user_ls_lock */
+static struct user_ls *__find_lockspace(int minor)
+{
+	struct user_ls *lsinfo;
+
+	list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
+		if (lsinfo->ls_miscinfo.minor == minor)
+			return lsinfo;
+	}
+	return NULL;
+}
+
+/* Find a lockspace struct given the device minor number */
+static struct user_ls *find_lockspace(int minor)
+{
+	struct user_ls *lsinfo;
+
+	down(&user_ls_lock);
+	lsinfo = __find_lockspace(minor);
+	up(&user_ls_lock);
+
+	return lsinfo;
+}
+
+/* Put a new lockspace on user_ls_list under user_ls_lock */
+static void add_lockspace_to_list(struct user_ls *lsinfo)
+{
+	down(&user_ls_lock);
+	list_add(&lsinfo->ls_list, &user_ls_list);
+	up(&user_ls_lock);
+}
+
+/* Register a lockspace with the DLM and create a misc
+   device for userland to access it.  The device is named
+   "<name_prefix>_<name>".  On success *ls is set and 0 returned;
+   on failure everything allocated here is released again. */
+static int register_lockspace(char *name, struct user_ls **ls, int flags)
+{
+	struct user_ls *newls;
+	int status;
+	int namelen;
+
+	/* +2: the '_' separator and the terminating NUL */
+	namelen = strlen(name)+strlen(name_prefix)+2;
+
+	newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
+	if (!newls)
+		return -ENOMEM;
+	memset(newls, 0, sizeof(struct user_ls));
+
+	newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
+	if (!newls->ls_miscinfo.name) {
+		kfree(newls);
+		return -ENOMEM;
+	}
+
+	status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
+				   DLM_USER_LVB_LEN);
+	if (status != 0) {
+		kfree(newls->ls_miscinfo.name);
+		kfree(newls);
+		return status;
+	}
+
+	snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
+		 name_prefix, name);
+
+	newls->ls_miscinfo.fops = &_dlm_fops;
+	newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
+
+	status = misc_register(&newls->ls_miscinfo);
+	if (status) {
+		printk(KERN_ERR "dlm: misc register failed for %s\n", name);
+		dlm_release_lockspace(newls->ls_lockspace, 0);
+		kfree(newls->ls_miscinfo.name);
+		kfree(newls);
+		return status;
+	}
+
+	if (flags & DLM_USER_LSFLG_AUTOFREE)
+		set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);
+
+	add_lockspace_to_list(newls);
+	*ls = newls;
+	return 0;
+}
+
+/* Called with the user_ls_lock semaphore held.
+   Releases the DLM lockspace and the misc device, unlinks the user_ls
+   and marks it deleted.  The struct itself is only freed here when no
+   file has it open; otherwise the last dlm_close() frees it. */
+static int unregister_lockspace(struct user_ls *lsinfo, int force)
+{
+	int status;
+
+	status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
+	if (status)
+		return status;
+
+	status = misc_deregister(&lsinfo->ls_miscinfo);
+	if (status)
+		return status;
+
+	list_del(&lsinfo->ls_list);
+	set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
+	lsinfo->ls_lockspace = NULL;
+	if (atomic_read(&lsinfo->ls_refcnt) == 0) {
+		kfree(lsinfo->ls_miscinfo.name);
+		kfree(lsinfo);
+	}
+
+	return 0;
+}
+
+/* Add it to userland's AST queue.
+   Snapshots the lksb into a new ast_info, appends it to the owning
+   file's queue and wakes any reader.  If the allocation fails the AST
+   is silently dropped. */
+static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
+			    int lvb_updated)
+{
+	struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
+	if (!ast)
+		return;
+
+	memset(ast, 0, sizeof(*ast));
+	ast->result.user_astparam = astparam;
+	ast->result.user_astaddr = astaddr;
+	ast->result.user_lksb = li->li_user_lksb;
+	memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
+	ast->lvb_updated = lvb_updated;
+
+	spin_lock(&li->li_file->fi_ast_lock);
+	list_add_tail(&ast->list, &li->li_file->fi_ast_list);
+	spin_unlock(&li->li_file->fi_ast_lock);
+	wake_up_interruptible(&li->li_file->fi_wait);
+}
+
+/* Kernel blocking-AST routine: forward to userland if the lock has a
+   blocking AST registered.  @mode is not passed on. */
+static void bast_routine(void *param, int mode)
+{
+	struct lock_info *li = param;
+
+	if (li && li->li_bastaddr)
+		add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
+}
+
+/*
+ * This is the kernel's AST routine.
+ * All lock, unlock & query operations complete here.
+ * The only synchronous ops are those done during device close.
+ */
+static void ast_routine(void *param)
+{
+	struct lock_info *li = param;
+
+	/* Param may be NULL if a persistent lock is unlocked by someone else */
+	if (!li)
+		return;
+
+	/* If this is a successful conversion then activate the blocking ast
+	 * args from the conversion request */
+	if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
+	    li->li_lksb.sb_status == 0) {
+
+		li->li_bastparam = li->li_pend_bastparam;
+		li->li_bastaddr = li->li_pend_bastaddr;
+		li->li_pend_bastaddr = NULL;
+	}
+
+	/* If it's an async request then post data to the user's AST queue. */
+	if (li->li_castaddr) {
+		int lvb_updated = 0;
+
+		/* See if the lvb has been updated */
+		if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
+			lvb_updated = 1;
+
+		if (li->li_lksb.sb_status == 0)
+			li->li_grmode = li->li_rqmode;
+
+		/* Only queue AST if the device is still open */
+		if (test_bit(1, &li->li_file->fi_flags))
+			add_to_astqueue(li, li->li_castaddr, li->li_castparam,
+					lvb_updated);
+
+		/* If it's a new lock operation that failed, then
+		 * remove it from the owner queue and free the
+		 * lock_info.
+		 */
+		if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
+		    li->li_lksb.sb_status != 0) {
+
+			/* Wait till dlm_lock() has finished */
+			down(&li->li_firstlock);
+			up(&li->li_firstlock);
+
+			spin_lock(&li->li_file->fi_li_lock);
+			list_del(&li->li_ownerqueue);
+			spin_unlock(&li->li_file->fi_li_lock);
+			release_lockinfo(li);
+			return;
+		}
+		/* Free unlocks & queries */
+		if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
+		    li->li_cmd == DLM_USER_QUERY) {
+			release_lockinfo(li);
+		}
+	} else {
+		/* Synchronous request, just wake up the caller */
+		set_bit(LI_FLAG_COMPLETE, &li->li_flags);
+		wake_up_interruptible(&li->li_waitq);
+	}
+}
+
+/*
+ * Wait for the lock op to complete and return the status.
+ */
+static int wait_for_ast(struct lock_info *li)
+{
+	/* Wait for the AST routine to complete.  The sleep state is set
+	   again before every check: after a wakeup the task is running,
+	   and without this the loop would spin calling schedule(). */
+	for (;;) {
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		if (test_bit(LI_FLAG_COMPLETE, &li->li_flags))
+			break;
+		schedule();
+	}
+
+	set_task_state(current, TASK_RUNNING);
+
+	return li->li_lksb.sb_status;
+}
+
+
+/* Open on control device */
+static int dlm_ctl_open(struct inode *inode, struct file *file)
+{
+	/* a NULL private_data is how dlm_write tells the control device
+	   apart from a lockspace device */
+	file->private_data = NULL;
+	return 0;
+}
+
+/* Close on control device */
+static int dlm_ctl_close(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/* Open on lockspace device.
+   Allocates the per-open file_info and takes a reference on the
+   lockspace.  Returns -ENOENT for an unknown minor, -ENOMEM on
+   allocation failure. */
+static int dlm_open(struct inode *inode, struct file *file)
+{
+	struct file_info *f;
+	struct user_ls *lsinfo;
+
+	lsinfo = find_lockspace(iminor(inode));
+	if (!lsinfo)
+		return -ENOENT;
+
+	f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
+	if (!f)
+		return -ENOMEM;
+
+	atomic_inc(&lsinfo->ls_refcnt);
+	INIT_LIST_HEAD(&f->fi_li_list);
+	INIT_LIST_HEAD(&f->fi_ast_list);
+	spin_lock_init(&f->fi_li_lock);
+	spin_lock_init(&f->fi_ast_lock);
+	init_waitqueue_head(&f->fi_wait);
+	f->fi_ls = lsinfo;
+	f->fi_flags = 0;
+	/* kmalloc() does not zero the memory, so the count must be set,
+	   not incremented: this is the reference owned by the open file */
+	atomic_set(&f->fi_refcnt, 1);
+	set_bit(1, &f->fi_flags);
+
+	file->private_data = f;
+
+	return 0;
+}
+
+/* Check the user's version matches ours: the major number must be equal
+   and the user's minor must not be newer than the kernel's.
+   Returns 0 or -EINVAL. */
+static int check_version(struct dlm_write_request *req)
+{
+	if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
+	    req->version[1] > DLM_DEVICE_VERSION_MINOR) {
+
+		printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+		       "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+		       current->comm,
+		       current->pid,
+		       req->version[0],
+		       req->version[1],
+		       req->version[2],
+		       DLM_DEVICE_VERSION_MAJOR,
+		       DLM_DEVICE_VERSION_MINOR,
+		       DLM_DEVICE_VERSION_PATCH);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Close on lockspace device.
+   Unlocks (or orphans, for persistent locks) everything this file still
+   holds, then drops the file's references on the lockspace and on the
+   file_info. */
+static int dlm_close(struct inode *inode, struct file *file)
+{
+	struct file_info *f = file->private_data;
+	struct lock_info li;
+	struct lock_info *old_li, *safe;
+	sigset_t tmpsig;
+	sigset_t allsigs;
+	/* Use the lockspace referenced at open time.  Looking it up by
+	   minor fails once unregister_lockspace() has unlinked it, and
+	   returning early then leaked the file_info, its lock_infos and
+	   the user_ls that the last close is meant to free. */
+	struct user_ls *lsinfo = f->fi_ls;
+	DECLARE_WAITQUEUE(wq, current);
+
+	/* Mark this closed so that ASTs will not be delivered any more */
+	clear_bit(1, &f->fi_flags);
+
+	/* Block signals while we are doing this */
+	sigfillset(&allsigs);
+	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+	/* We use our own lock_info struct here, so that any
+	 * outstanding "real" ASTs will be delivered with the
+	 * corresponding "real" params, thus freeing the lock_info
+	 * that belongs the lock. This catches the corner case where
+	 * a lock is BUSY when we try to unlock it here
+	 */
+	memset(&li, 0, sizeof(li));
+	clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
+	init_waitqueue_head(&li.li_waitq);
+	add_wait_queue(&li.li_waitq, &wq);
+
+	/*
+	 * Free any outstanding locks, they are on the
+	 * list in LIFO order so there should be no problems
+	 * about unlocking parents before children.
+	 */
+	list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
+		int status;
+		int flags = 0;
+
+		/* The lockspace has already been released, so there is
+		   no handle left to unlock through; only drop our own
+		   bookkeeping.
+		   NOTE(review): assumes dlm_release_lockspace() disposed
+		   of the DLM-side locks -- confirm. */
+		if (!lsinfo->ls_lockspace) {
+			list_del(&old_li->li_ownerqueue);
+			release_lockinfo(old_li);
+			continue;
+		}
+
+		/* Don't unlock persistent locks, just mark them orphaned */
+		if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
+			list_del(&old_li->li_ownerqueue);
+
+			/* Update master copy */
+			/* TODO: Check locking core updates the local and
+			   remote ORPHAN flags */
+			li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
+			status = dlm_lock(f->fi_ls->ls_lockspace,
+					  old_li->li_grmode, &li.li_lksb,
+					  DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
+					  NULL, 0, 0, ast_routine, NULL,
+					  NULL, NULL);
+			if (status != 0)
+				printk("dlm: Error orphaning lock %x: %d\n",
+				       old_li->li_lksb.sb_lkid, status);
+
+			/* But tidy our references in it */
+			release_lockinfo(old_li);
+			continue;
+		}
+
+		clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
+
+		flags = DLM_LKF_FORCEUNLOCK;
+		if (old_li->li_grmode >= DLM_LOCK_PW)
+			flags |= DLM_LKF_IVVALBLK;
+
+		status = dlm_unlock(f->fi_ls->ls_lockspace,
+				    old_li->li_lksb.sb_lkid, flags,
+				    &li.li_lksb, &li);
+
+		/* Must wait for it to complete as the next lock could be its
+		 * parent */
+		if (status == 0)
+			wait_for_ast(&li);
+
+		/* Unlock succeeded, free the lock_info struct. */
+		if (status == 0)
+			release_lockinfo(old_li);
+	}
+
+	remove_wait_queue(&li.li_waitq, &wq);
+
+	/*
+	 * If this is the last reference to the lockspace
+	 * then free the struct. If it's an AUTOFREE lockspace
+	 * then free the whole thing.
+	 */
+	down(&user_ls_lock);
+	if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
+
+		if (lsinfo->ls_lockspace) {
+			if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
+				unregister_lockspace(lsinfo, 1);
+			}
+		} else {
+			kfree(lsinfo->ls_miscinfo.name);
+			kfree(lsinfo);
+		}
+	}
+	up(&user_ls_lock);
+	put_file_info(f);
+
+	/* Restore signals */
+	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+	recalc_sigpending();
+
+	return 0;
+}
+
+/* Create a lockspace on behalf of userland (needs CAP_SYS_ADMIN).
+   Returns the minor number of the new device, or a negative errno.
+   NOTE(review): kparams->name is handed to strlen() in
+   register_lockspace(); nothing here checks that the user-supplied
+   buffer is NUL-terminated -- confirm the caller guarantees it. */
+static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
+				    struct dlm_lspace_params *kparams)
+{
+	int status;
+	struct user_ls *lsinfo;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
+
+	/* If it succeeded then return the minor number */
+	if (status == 0)
+		status = lsinfo->ls_miscinfo.minor;
+
+	return status;
+}
+
+/* Remove the lockspace with minor kparams->minor (needs CAP_SYS_ADMIN).
+   The release is done with force 1, or 2 when
+   DLM_USER_LSFLG_FORCEFREE is set.  Returns -EINVAL for an unknown
+   minor, otherwise the result of unregister_lockspace(). */
+static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
+				    struct dlm_lspace_params *kparams)
+{
+	int status;
+	int force = 1;
+	struct user_ls *lsinfo;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	down(&user_ls_lock);
+	lsinfo = __find_lockspace(kparams->minor);
+	if (!lsinfo) {
+		up(&user_ls_lock);
+		return -EINVAL;
+	}
+
+	if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
+		force = 2;
+
+	status = unregister_lockspace(lsinfo, force);
+	up(&user_ls_lock);
+
+	return status;
+}
+
+/* Read call, might block if no ASTs are waiting.
+ * It will only ever return one message at a time, regardless
+ * of how many are pending.
+ * Returns the number of bytes copied, 0 (EOF) when nothing is queued
+ * and the lockspace has been deleted, or a negative errno.
+ */
+static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
+			loff_t *ppos)
+{
+	struct file_info *fi = file->private_data;
+	struct ast_info *ast;
+	int data_size;
+	int offset;
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (count < sizeof(struct dlm_lock_result))
+		return -EINVAL;
+
+	spin_lock(&fi->fi_ast_lock);
+	if (list_empty(&fi->fi_ast_list)) {
+
+		/* No waiting ASTs.
+		 * Return EOF if the lockspace been deleted.
+		 */
+		if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) {
+			/* don't leave with the spinlock held */
+			spin_unlock(&fi->fi_ast_lock);
+			return 0;
+		}
+
+		if (file->f_flags & O_NONBLOCK) {
+			spin_unlock(&fi->fi_ast_lock);
+			return -EAGAIN;
+		}
+
+		add_wait_queue(&fi->fi_wait, &wait);
+
+	repeat:
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&fi->fi_ast_list) &&
+		    !signal_pending(current)) {
+
+			spin_unlock(&fi->fi_ast_lock);
+			schedule();
+			spin_lock(&fi->fi_ast_lock);
+			goto repeat;
+		}
+
+		current->state = TASK_RUNNING;
+		remove_wait_queue(&fi->fi_wait, &wait);
+
+		if (signal_pending(current)) {
+			spin_unlock(&fi->fi_ast_lock);
+			return -ERESTARTSYS;
+		}
+	}
+
+	ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
+	list_del(&ast->list);
+	spin_unlock(&fi->fi_ast_lock);
+
+	/* Work out the size of the returned data */
+	data_size = sizeof(struct dlm_lock_result);
+	if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
+		data_size += DLM_USER_LVB_LEN;
+
+	offset = sizeof(struct dlm_lock_result);
+
+	/* Room for the extended data ? */
+	if (count >= data_size) {
+
+		if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
+			if (copy_to_user(buffer+offset,
+					 ast->result.lksb.sb_lvbptr,
+					 DLM_USER_LVB_LEN)) {
+				/* already unlinked from the queue, so
+				   it must be freed before bailing out */
+				kfree(ast);
+				return -EFAULT;
+			}
+			ast->result.lvb_offset = offset;
+			offset += DLM_USER_LVB_LEN;
+		}
+	}
+
+	ast->result.length = data_size;
+	/* Copy the header now it has all the offsets in it */
+	if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
+		offset = -EFAULT;
+
+	/* If we only returned a header and there's more to come then put it
+	   back on the list */
+	if (count < data_size) {
+		spin_lock(&fi->fi_ast_lock);
+		list_add(&ast->list, &fi->fi_ast_list);
+		spin_unlock(&fi->fi_ast_lock);
+	} else
+		kfree(ast);
+	return offset;
+}
+
+/* Poll: readable whenever at least one AST is queued for this file */
+static unsigned int dlm_poll(struct file *file, poll_table *wait)
+{
+	struct file_info *fi = file->private_data;
+
+	poll_wait(file, &fi->fi_wait, wait);
+
+	spin_lock(&fi->fi_ast_lock);
+	if (!list_empty(&fi->fi_ast_list)) {
+		spin_unlock(&fi->fi_ast_lock);
+		return POLLIN | POLLRDNORM;
+	}
+
+	spin_unlock(&fi->fi_ast_lock);
+	return 0;
+}
+
+/* Allocate and initialise a lock_info owned by @fi.
+   Takes a module reference and a file_info reference; both are dropped
+   again by release_lockinfo().  Returns NULL if the module is going
+   away or memory is short. */
+static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
+					   struct dlm_lock_params *kparams)
+{
+	struct lock_info *li;
+
+	if (!try_module_get(THIS_MODULE))
+		return NULL;
+
+	li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
+	if (!li) {
+		/* give back the reference taken above */
+		module_put(THIS_MODULE);
+		return NULL;
+	}
+
+	/* Start from a known state: several fields (li_lksb, li_user_lksb,
+	   li_firstlock) are only filled in later by the callers */
+	memset(li, 0, sizeof(*li));
+	li->li_magic = LOCKINFO_MAGIC;
+	li->li_file = fi;
+	li->li_cmd = cmd;
+	li->li_flags = 0;
+	li->li_grmode = -1;
+	li->li_rqmode = -1;
+	li->li_pend_bastparam = NULL;
+	li->li_pend_bastaddr = NULL;
+	li->li_castaddr = NULL;
+	li->li_castparam = NULL;
+	li->li_lksb.sb_lvbptr = NULL;
+	li->li_bastaddr = kparams->bastaddr;
+	li->li_bastparam = kparams->bastparam;
+	/* ast_routine wakes li_waitq when no completion AST is set, and
+	   list_del()s li_ownerqueue when a new lock fails, so both must
+	   be valid from the start */
+	init_waitqueue_head(&li->li_waitq);
+	INIT_LIST_HEAD(&li->li_ownerqueue);
+
+	get_file_info(fi);
+
+	return li;
+}
+
+/* Handle a DLM_USER_LOCK request: a new lock, a conversion of an
+   existing one, or the adoption of an orphaned persistent lock.
+   Returns the lock ID on success or a negative errno. */
+static int do_user_lock(struct file_info *fi, uint8_t cmd,
+			struct dlm_lock_params *kparams)
+{
+	struct lock_info *li;
+	int status;
+	int firstlock = 0;	/* li was created by this call */
+	int registered = 0;	/* li is already in lockinfo_idr */
+	uint32_t lkid;
+
+	/*
+	 * Validate things that we need to have correct.
+	 */
+	if (!kparams->castaddr)
+		return -EINVAL;
+
+	if (!kparams->lksb)
+		return -EINVAL;
+
+	/* Persistent child locks are not available yet */
+	if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
+		return -EINVAL;
+
+	/* For conversions, there should already be a lockinfo struct,
+	   unless we are adopting an orphaned persistent lock */
+	if (kparams->flags & DLM_LKF_CONVERT) {
+
+		li = get_lockinfo(kparams->lkid);
+
+		/* If this is a persistent lock we will have to create a
+		   lockinfo again.  The request's flags are what has to be
+		   tested; the bare constant is always true. */
+		if (!li && (kparams->flags & DLM_LKF_PERSISTENT)) {
+			li = allocate_lockinfo(fi, cmd, kparams);
+			if (!li)
+				return -ENOMEM;
+
+			li->li_lksb.sb_lkid = kparams->lkid;
+			li->li_castaddr = kparams->castaddr;
+			li->li_castparam = kparams->castparam;
+
+			/* OK, this isn't exactly a FIRSTLOCK but it is the
+			   first time we've used this lockinfo, and if things
+			   fail we want rid of it */
+			init_MUTEX_LOCKED(&li->li_firstlock);
+			set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
+			firstlock = 1;
+			/* best effort, as before; retried below if it
+			   did not go in */
+			registered = !add_lockinfo(li);
+
+			/* TODO: do a query to get the current state ?? */
+		}
+		if (!li)
+			return -EINVAL;
+
+		if (li->li_magic != LOCKINFO_MAGIC)
+			return -EINVAL;
+
+		/* For conversions don't overwrite the current blocking AST
+		   info so that:
+		   a) if a blocking AST fires before the conversion is queued
+		      it runs the current handler
+		   b) if the conversion is cancelled, the original blocking AST
+		      declaration is active
+		   The pend_ info is made active when the conversion
+		   completes.
+		*/
+		li->li_pend_bastaddr = kparams->bastaddr;
+		li->li_pend_bastparam = kparams->bastparam;
+	} else {
+		li = allocate_lockinfo(fi, cmd, kparams);
+		if (!li)
+			return -ENOMEM;
+
+		/* semaphore to allow us to complete our work before
+		   the AST routine runs. In fact we only need (and use) this
+		   when the initial lock fails */
+		init_MUTEX_LOCKED(&li->li_firstlock);
+		set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
+		firstlock = 1;
+	}
+
+	li->li_user_lksb = kparams->lksb;
+	li->li_castaddr = kparams->castaddr;
+	li->li_castparam = kparams->castparam;
+	li->li_lksb.sb_lkid = kparams->lkid;
+	li->li_rqmode = kparams->mode;
+	if (kparams->flags & DLM_LKF_PERSISTENT)
+		set_bit(LI_FLAG_PERSISTENT, &li->li_flags);
+
+	/* Copy in the value block */
+	if (kparams->flags & DLM_LKF_VALBLK) {
+		if (!li->li_lksb.sb_lvbptr) {
+			li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
+							GFP_KERNEL);
+			if (!li->li_lksb.sb_lvbptr) {
+				status = -ENOMEM;
+				goto out_err;
+			}
+		}
+
+		memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
+	}
+
+	/* Lock it ... */
+	status = dlm_lock(fi->fi_ls->ls_lockspace,
+			  kparams->mode, &li->li_lksb,
+			  kparams->flags,
+			  kparams->name, kparams->namelen,
+			  kparams->parent,
+			  ast_routine,
+			  li,
+			  (li->li_pend_bastaddr || li->li_bastaddr) ?
+			   bast_routine : NULL,
+			  kparams->range.ra_end ? &kparams->range : NULL);
+	if (status)
+		goto out_err;
+
+	/* Return the lockid as the user needs it /now/ */
+	lkid = li->li_lksb.sb_lkid;
+
+	/* If it succeeded (this far) with a new lock then keep track of
+	   it on the file's lockinfo list.  The decision uses the local
+	   flag: ast_routine clears LI_FLAG_FIRSTLOCK as soon as it runs,
+	   and if it ran for a failed lock it is now blocked on
+	   li_firstlock waiting for the up() below. */
+	if (firstlock) {
+
+		spin_lock(&fi->fi_li_lock);
+		list_add(&li->li_ownerqueue, &fi->fi_li_list);
+		spin_unlock(&fi->fi_li_lock);
+		if (!registered && add_lockinfo(li))
+			printk(KERN_WARNING "Add lockinfo failed\n");
+
+		/* li must not be touched after this: a waiting
+		   ast_routine frees it */
+		up(&li->li_firstlock);
+	}
+
+	return lkid;
+
+ out_err:
+	if (firstlock)
+		release_lockinfo(li);
+	return status;
+
+}
+
+/* Handle a DLM_USER_UNLOCK request (unlock, or cancel when
+   DLM_LKF_CANCEL is set) for lock ID kparams->lkid.
+   Returns the result of dlm_unlock() or a negative errno. */
+static int do_user_unlock(struct file_info *fi, uint8_t cmd,
+			  struct dlm_lock_params *kparams)
+{
+	struct lock_info *li;
+	int status;
+	int convert_cancel = 0;
+
+	li = get_lockinfo(kparams->lkid);
+	if (!li) {
+		li = allocate_lockinfo(fi, cmd, kparams);
+		/* check before the new entry is linked in, not after */
+		if (!li)
+			return -ENOMEM;
+		spin_lock(&fi->fi_li_lock);
+		list_add(&li->li_ownerqueue, &fi->fi_li_list);
+		spin_unlock(&fi->fi_li_lock);
+	}
+
+	if (li->li_magic != LOCKINFO_MAGIC)
+		return -EINVAL;
+
+	li->li_user_lksb = kparams->lksb;
+	li->li_castparam = kparams->castparam;
+	li->li_cmd = cmd;
+
+	/* Cancelling a conversion doesn't remove the lock...*/
+	if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
+		convert_cancel = 1;
+
+	/* dlm_unlock() passes a 0 for castaddr which means don't overwrite
+	   the existing li_castaddr as that's the completion routine for
+	   unlocks. dlm_unlock_wait() specifies a new AST routine to be
+	   executed when the unlock completes.
*/ + if (kparams->castaddr) + li->li_castaddr = kparams->castaddr; + + /* Use existing lksb & astparams */ + status = dlm_unlock(fi->fi_ls->ls_lockspace, + kparams->lkid, + kparams->flags, &li->li_lksb, li); + + if (!status && !convert_cancel) { + spin_lock(&fi->fi_li_lock); + list_del(&li->li_ownerqueue); + spin_unlock(&fi->fi_li_lock); + } + + return status; +} + +/* Write call, submit a locking request */ +static ssize_t dlm_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct file_info *fi = file->private_data; + struct dlm_write_request *kparams; + sigset_t tmpsig; + sigset_t allsigs; + int status; + + /* -1 because lock name is optional */ + if (count < sizeof(struct dlm_write_request)-1) + return -EINVAL; + + /* Has the lockspace been deleted */ + if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) + return -ENOENT; + + kparams = kmalloc(count, GFP_KERNEL); + if (!kparams) + return -ENOMEM; + + status = -EFAULT; + /* Get the command info */ + if (copy_from_user(kparams, buffer, count)) + goto out_free; + + status = -EBADE; + if (check_version(kparams)) + goto out_free; + + /* Block signals while we are doing this */ + sigfillset(&allsigs); + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); + + status = -EINVAL; + switch (kparams->cmd) + { + case DLM_USER_LOCK: + if (!fi) goto out_sig; + status = do_user_lock(fi, kparams->cmd, &kparams->i.lock); + break; + + case DLM_USER_UNLOCK: + if (!fi) goto out_sig; + status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock); + break; + + case DLM_USER_CREATE_LOCKSPACE: + if (fi) goto out_sig; + status = do_user_create_lockspace(fi, kparams->cmd, + &kparams->i.lspace); + break; + + case DLM_USER_REMOVE_LOCKSPACE: + if (fi) goto ou