From 09adc80c611bb8902daa8ccfe34dbbc009d6befe Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Feb 2011 21:38:47 -0800 Subject: ceph: preserve I_COMPLETE across rename d_move puts the renamed dentry at the end of d_subdirs, screwing with our cached dentry directory offsets. We were just clearing I_COMPLETE to avoid any possibility of trouble. However, assigning the renamed dentry an offset at the end of the directory (to match it's new d_subdirs position) is sufficient to maintain correct behavior and hold onto I_COMPLETE. This is especially important for workloads like rsync, which renames files into place. Before, we would lose I_COMPLETE and do MDS lookups for each file. With this patch we only talk to the MDS on create and rename. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 193bfa5e9cbd..60456361e07d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1030,9 +1030,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); - /* d_move screws up d_subdirs order */ - ceph_i_clear(dir, CEPH_I_COMPLETE); - d_move(req->r_old_dentry, dn); dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, @@ -1044,12 +1041,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* take overwritten dentry's readdir offset */ - dout("dn %p gets %p offset %lld (old offset %lld)\n", - req->r_old_dentry, dn, ceph_dentry(dn)->offset, + /* + * d_move() puts the renamed dentry at the end of + * d_subdirs. We need to assign it an appropriate + * directory offset so we can behave when holding + * I_COMPLETE. + */ + ceph_set_dentry_offset(req->r_old_dentry); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); - ceph_dentry(req->r_old_dentry)->offset = - ceph_dentry(dn)->offset; dn = req->r_old_dentry; /* use old_dentry */ in = dn->d_inode; -- cgit v1.2.3 From 21f3b5f1bbc3c27e82a8c9fc9861fa20bcb31f26 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 19 Jan 2011 09:45:22 -0800 Subject: ceph: remove debugfs debug cruft Whoops! Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 08f65faac112..0dba6915712b 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_congestion_kb) goto out; - dout("a\n"); - snprintf(name, sizeof(name), "../../bdi/%s", dev_name(fsc->backing_dev_info.dev)); fsc->debugfs_bdi = @@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_bdi) goto out; - dout("b\n"); fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 0600, fsc->client->debugfs_dir, @@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_mdsmap) goto out; - dout("ca\n"); fsc->debugfs_mdsc = debugfs_create_file("mdsc", 0600, fsc->client->debugfs_dir, @@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_mdsc) goto out; - dout("da\n"); fsc->debugfs_caps = debugfs_create_file("caps", 0400, fsc->client->debugfs_dir, @@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_caps) goto out; - dout("ea\n"); fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 0600, fsc->client->debugfs_dir, -- cgit v1.2.3 From ad1fee96cbaf873520064252c5dc3212c9844861 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 21 Jan 2011 16:44:03 -0800 Subject: ceph: add ino32 mount option The ino32 mount option forces the ceph fs to report 32 bit ino values. This is useful for 64 bit kernels with 32 bit userspace. Signed-off-by: Yehuda Sadeh --- fs/ceph/dir.c | 11 ++++++---- fs/ceph/inode.c | 9 +++++++- fs/ceph/super.c | 5 +++++ fs/ceph/super.h | 65 +++++++++++++++++++++++++++++++++++++++------------------ 4 files changed, 65 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ebafa65a29b6..cbe875d3a522 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -161,7 +161,7 @@ more: filp->f_pos = di->offset; err = filldir(dirent, dentry->d_name.name, dentry->d_name.len, di->offset, - dentry->d_inode->i_ino, + ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), dentry->d_inode->i_mode >> 12); if (last) { @@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) dout("readdir off 0 -> '.'\n"); if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), - inode->i_ino, inode->i_mode >> 12) < 0) + ceph_translate_ino(inode->i_sb, inode->i_ino), + inode->i_mode >> 12) < 0) return 0; filp->f_pos = 1; off = 1; } if (filp->f_pos == 1) { + ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino; dout("readdir off 1 -> '..'\n"); if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), - filp->f_dentry->d_parent->d_inode->i_ino, + ceph_translate_ino(inode->i_sb, ino), inode->i_mode >> 12) < 0) return 0; filp->f_pos = 2; @@ -377,7 +379,8 @@ more: if (filldir(dirent, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, ino, ftype) < 0) { + pos, + ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { dout("filldir stopping us...\n"); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 60456361e07d..b54c97da1c43 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work); /* * find or create an inode, given the ceph ino number */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; @@ -1809,7 +1816,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); if (!err) { generic_fillattr(inode, stat); - stat->ino = inode->i_ino; + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); if (ceph_snap(inode) != CEPH_NOSNAP) stat->dev = ceph_snap(inode); else diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9c5085465a63..e39ea78c4894 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -131,6 +131,7 @@ enum { Opt_rbytes, Opt_norbytes, Opt_noasyncreaddir, + Opt_ino32, }; static match_table_t fsopt_tokens = { @@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = { {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_ino32, "ino32"}, {-1, NULL} }; @@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; + case Opt_ino32: + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + break; default: BUG_ON(token); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 20b907d76ae2..5405c903704e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -27,6 +27,7 @@ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) @@ -319,6 +320,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return container_of(inode, struct ceph_inode_info, vfs_inode); } +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +{ + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; +} + +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +{ + return (struct ceph_fs_client *)sb->s_fs_info; +} + static inline struct ceph_vino ceph_vino(struct inode *inode) { return ceph_inode(inode)->i_vino; @@ -327,19 +338,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) /* * ino_t is <64 bits on many architectures, blech. * - * don't include snap in ino hash, at least for now. + * i_ino (kernel inode) st_ino (userspace) + * i386 32 32 + * x86_64+ino32 64 32 + * x86_64 64 64 + */ +static inline u32 ceph_ino_to_ino32(ino_t ino) +{ + ino ^= ino >> (sizeof(ino) * 8 - 32); + if (!ino) + ino = 1; + return ino; +} + +/* + * kernel i_ino value */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; - if (!ino) - ino = 1; + ino = ceph_ino_to_ino32(ino); #endif return ino; } +/* + * user-visible ino (stat, filldir) + */ +#if BITS_PER_LONG == 32 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + return ino; +} +#else +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) +{ + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) + ino = ceph_ino_to_ino32(ino); + return ino; +} +#endif + + /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -428,13 +469,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) return ((loff_t)frag << 32) | (loff_t)off; } -static inline int ceph_set_ino_cb(struct inode *inode, void *data) -{ - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); - return 0; -} - /* * caps helpers */ @@ -503,15 +537,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) -{ - return (struct ceph_fs_client *)inode->i_sb->s_fs_info; -} - -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) -{ - return (struct ceph_fs_client *)sb->s_fs_info; -} /* -- cgit v1.2.3 From 80456f8672f7e69d05c01627da03587dc1ea1603 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 10 Mar 2011 13:33:26 -0800 Subject: ceph: move readahead default to fs/ceph from libceph Signed-off-by: Sage Weil --- fs/ceph/super.c | 4 ++-- fs/ceph/super.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e39ea78c4894..a9e78b4a258c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -293,7 +293,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->sb_flags = flags; fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; + fsopt->rsize = CEPH_RSIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; @@ -375,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); - if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + if (fsopt->rsize != CEPH_RSIZE_DEFAULT) seq_printf(m, ",rsize=%d", fsopt->rsize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5405c903704e..619fe719968f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,6 +36,7 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) +#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" -- cgit v1.2.3 From 78a255654fa7f01945dea0dcedcf5113b3ad9f93 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Tue, 15 Mar 2011 09:18:01 +0000 Subject: ceph: remove request from unsafe list if it is canceled/timed out This fixes the list corruption warning like this: ------------[ cut here ]------------ WARNING: at lib/list_debug.c:30 __list_add+0x68/0x81() Hardware name: X8DTU list_add corruption. prev->next should be next (ffff880618931250), but was (null). (prev=ffff880c188b9130). Modules linked in: nfsd lockd nfs_acl auth_rpcgss exportfs ceph libceph libcrc32c sunrpc ipv6 fuse igb i2c_i801 ioatdma i2c_core iTCO_wdt iTCO_vendor_support joydev dca serio_raw usb_storage [last unloaded: scsi_wait_scan] Pid: 10977, comm: smbd Tainted: G W 2.6.32.23-170.Elaster.xendom0.fc12.x86_64 #1 Call Trace: [] warn_slowpath_common+0x7c/0x94 [] warn_slowpath_fmt+0x41/0x43 [] __list_add+0x68/0x81 [] ceph_aio_write+0x614/0x8a2 [ceph] [] do_sync_write+0xe8/0x125 [] ? autoremove_wake_function+0x0/0x39 [] ? selinux_file_permission+0x5c/0xb3 [] ? security_file_permission+0x16/0x18 [] vfs_write+0xae/0x10b [] sys_pwrite64+0x5a/0x76 [] system_call_fastpath+0x16/0x1b ---[ end trace 08573eb9f07ff6f4 ]--- Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d0e4a82d898..db5d86309744 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -568,7 +568,14 @@ more: spin_unlock(&ci->i_unsafe_lock); ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); } + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret < 0 && req->r_safe_callback) { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } if (file->f_flags & O_DIRECT) -- cgit v1.2.3 From 49bcb93236ce1c60d9b7eb21a0aea1999f4d8709 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Tue, 15 Mar 2011 09:18:02 +0000 Subject: ceph: add request to the tail of unsafe write list In sync_write_wait(), we assume that the newest request is at the tail of unsafe write list. We should maintain the semantics here. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index db5d86309744..159b512d5a27 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -564,7 +564,8 @@ more: * start_request so that a tid has been assigned. */ spin_lock(&ci->i_unsafe_lock); - list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); } -- cgit v1.2.3 From 147851d2dc4d2be2f60d40276d12d7ef82f8a7ce Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Mar 2011 14:57:41 -0700 Subject: ceph: rename dentry_release -> d_release, fix comment Just for consistency's sake. Fix obsolete comment too. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index cbe875d3a522..1a867a3601ae 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1027,14 +1027,13 @@ out_touch: } /* - * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen or if this is in the snapshot namespace. + * Release our ceph_dentry_info. */ -static void ceph_dentry_release(struct dentry *dentry) +static void ceph_d_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - dout("dentry_release %p\n", dentry); + dout("d_release %p\n", dentry); if (di) { ceph_dentry_lru_del(dentry); if (di->lease_session) @@ -1259,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, - .d_release = ceph_dentry_release, + .d_release = ceph_d_release, }; const struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, - .d_release = ceph_dentry_release, + .d_release = ceph_d_release, }; const struct dentry_operations ceph_snap_dentry_ops = { - .d_release = ceph_dentry_release, + .d_release = ceph_d_release, }; -- cgit v1.2.3