From 81fca444001e5a41ab80ce8cf9a5734c00ec6546 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:47:47 +0200 Subject: fs: move permission check back into __lookup_hash The caller that didn't need it is gone. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/namei.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 24896e833565..f1ef97dbc6c4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1121,11 +1121,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, struct nameidata *nd) { + struct inode *inode = base->d_inode; struct dentry *dentry; - struct inode *inode; int err; - inode = base->d_inode; + err = exec_permission(inode); + if (err) + return ERR_PTR(err); /* * See if the low-level filesystem might want @@ -1161,11 +1163,6 @@ out: */ static struct dentry *lookup_hash(struct nameidata *nd) { - int err; - - err = exec_permission(nd->path.dentry->d_inode); - if (err) - return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); } @@ -1213,9 +1210,6 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = exec_permission(base->d_inode); - if (err) - return ERR_PTR(err); return __lookup_hash(&this, base, NULL); } -- cgit v1.2.3 From c37650161a53c01ddd88587675f9a4adc909a73e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:48:20 +0200 Subject: fs: add sync_inode_metadata Add a new helper to write out the inode using the writeback code, that is including the correct dirty bit and list manipulation. A few of filesystems already opencode this, and a lot of others should be using it instead of using write_inode_now which also writes out the data. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- drivers/staging/pohmelfs/inode.c | 6 +----- fs/exofs/file.c | 6 +----- fs/ext2/dir.c | 2 +- fs/ext2/ext2.h | 1 - fs/ext2/inode.c | 11 +---------- fs/ext2/xattr.c | 2 +- fs/fs-writeback.c | 20 ++++++++++++++++++++ fs/libfs.c | 6 +----- fs/nfsd/vfs.c | 16 +++------------- include/linux/fs.h | 1 + 10 files changed, 30 insertions(+), 41 deletions(-) diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index 97dae297ca3c..c62d30017c07 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -882,12 +882,8 @@ static struct inode *pohmelfs_alloc_inode(struct super_block *sb) static int pohmelfs_fsync(struct file *file, int datasync) { struct inode *inode = file->f_mapping->host; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - return sync_inode(inode, &wbc); + return sync_inode_metadata(inode, 1); } ssize_t pohmelfs_write(struct file *file, const char __user *buf, diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 68cb23e3bb98..b905c79b4f0a 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -46,10 +46,6 @@ static int exofs_file_fsync(struct file *filp, int datasync) { int ret; struct inode *inode = filp->f_mapping->host; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata-only; caller takes care of data */ - }; struct super_block *sb; if (!(inode->i_state & I_DIRTY)) @@ -57,7 +53,7 @@ static int exofs_file_fsync(struct file *filp, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) return 0; - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); /* This is a good place to write the sb */ /* TODO: Sechedule an sb-sync on create */ diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 764109886ec0..2709b34206ab 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -98,7 +98,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) if (IS_DIRSYNC(dir)) { err = write_one_page(page, 1); if (!err) - err = ext2_sync_inode(dir); + err = sync_inode_metadata(dir, 1); } else { unlock_page(page); } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 416daa62242c..6346a2acf326 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -120,7 +120,6 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); -extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 533699c16040..40ad210a5049 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1203,7 +1203,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); - ext2_sync_inode (inode); + sync_inode_metadata(inode, 1); } else { mark_inode_dirty(inode); } @@ -1523,15 +1523,6 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_sync_inode(struct inode *inode) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - return sync_inode(inode, &wbc); -} - int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 8c29ae15129e..f84700be3274 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) { - error = ext2_sync_inode (inode); + error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually * written (only some dirty data were not) so we just proceed * as if nothing happened and cleanup the unused block */ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef1c9a1..29e3f409bbd0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1198,3 +1198,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); + +/** + * sync_inode - write an inode to disk + * @inode: the inode to sync + * @wait: wait for I/O to complete. + * + * Write an inode to disk and adjust it's dirty state after completion. + * + * Note: only writes the actual inode, no associated data or other metadata. + */ +int sync_inode_metadata(struct inode *inode, int wait) +{ + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .nr_to_write = 0, /* metadata-only */ + }; + + return sync_inode(inode, &wbc); +} +EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/libfs.c b/fs/libfs.c index 62baa0387d6e..2dbf4877d7ef 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -892,10 +892,6 @@ EXPORT_SYMBOL_GPL(generic_fh_to_parent); */ int generic_file_fsync(struct file *file, int datasync) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata-only; caller takes care of data */ - }; struct inode *inode = file->f_mapping->host; int err; int ret; @@ -906,7 +902,7 @@ int generic_file_fsync(struct file *file, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) return ret; - err = sync_inode(inode, &wbc); + err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; return ret; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 661a6cf8e826..184938fcff04 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -281,23 +281,13 @@ commit_metadata(struct svc_fh *fhp) { struct inode *inode = fhp->fh_dentry->d_inode; const struct export_operations *export_ops = inode->i_sb->s_export_op; - int error = 0; if (!EX_ISSYNC(fhp->fh_export)) return 0; - if (export_ops->commit_metadata) { - error = export_ops->commit_metadata(inode); - } else { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* metadata only */ - }; - - error = sync_inode(inode, &wbc); - } - - return error; + if (export_ops->commit_metadata) + return export_ops->commit_metadata(inode); + return sync_inode_metadata(inode, 1); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f34ff6e5558..0b03f490572f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1734,6 +1734,7 @@ static inline void file_accessed(struct file *file) } int sync_inode(struct inode *inode, struct writeback_control *wbc); +int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; -- cgit v1.2.3 From 56b0dacfa2b8416815a2f2a5f4f51e46be4cf14c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:48:55 +0200 Subject: fs: mark destroy_inode static Hugetlbfs used to need it, but after the destroy_inode and evict_inode changes it's not required anymore. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/inode.c | 2 +- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 86464332e590..3368abd64bb5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -235,7 +235,7 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); -void destroy_inode(struct inode *inode) +static void destroy_inode(struct inode *inode) { __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b03f490572f..0a5d83633884 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2187,7 +2187,6 @@ extern void unlock_new_inode(struct inode *); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void end_writeback(struct inode *); -extern void destroy_inode(struct inode *); extern void __destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); -- cgit v1.2.3 From ebdec241d509cf69f6ebf1ecdc036359d3dbe154 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:47:23 +0200 Subject: fs: kill block_prepare_write __block_write_begin and block_prepare_write are identical except for slightly different calling conventions. Convert all callers to the __block_write_begin calling conventions and drop block_prepare_write. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/buffer.c | 17 +++++------------ fs/ext3/inode.c | 4 ++-- fs/ext4/inode.c | 11 +++++------ fs/gfs2/aops.c | 3 +-- fs/gfs2/ops_inode.c | 6 +++--- fs/ocfs2/aops.c | 19 ++----------------- fs/ocfs2/aops.h | 3 --- fs/ocfs2/file.c | 9 ++++----- fs/reiserfs/inode.c | 24 +++++++++++------------- fs/reiserfs/ioctl.c | 6 ++---- fs/reiserfs/xattr.c | 5 +---- fs/xfs/linux-2.6/xfs_super.c | 2 +- include/linux/buffer_head.h | 1 - include/linux/reiserfs_fs.h | 2 ++ 14 files changed, 39 insertions(+), 73 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b083f77..a7b8f3c59a4e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1834,9 +1834,11 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int block_prepare_write(struct page *page, unsigned from, unsigned to, +int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; struct inode *inode = page->mapping->host; unsigned block_start, block_end; sector_t block; @@ -1916,7 +1918,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to, } return err; } -EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to) @@ -1953,15 +1955,6 @@ static int __block_commit_write(struct inode *inode, struct page *page, return 0; } -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) -{ - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - - return block_prepare_write(page, start, start + len, get_block); -} -EXPORT_SYMBOL(__block_write_begin); - /* * block_write_begin takes care of the basic task of block allocation and * bringing partial write blocks uptodate first. @@ -2379,7 +2372,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, else end = PAGE_CACHE_SIZE; - ret = block_prepare_write(page, 0, end, get_block); + ret = __block_write_begin(page, 0, end, get_block); if (!ret) ret = block_commit_write(page, 0, end); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5e0faf4cda79..ad05353040a1 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1696,8 +1696,8 @@ static int ext3_journalled_writepage(struct page *page, * doesn't seem much point in redirtying the page here. */ ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext3_get_block); + ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE, + ext3_get_block); if (ret != 0) { ext3_journal_stop(handle); goto out_unlock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b8debeb3965..49635ef236f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1538,10 +1538,10 @@ static int do_journal_get_write_access(handle_t *handle, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* - * __block_prepare_write() could have dirtied some buffers. Clean + * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_prepare_write() isn't a real problem here as we clear + * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ @@ -2550,8 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, if (buffer_delay(bh)) return 0; /* Not sure this could or should happen */ /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? + * XXX: __block_write_begin() unmaps passed block, is it OK? */ ret = ext4_da_reserve_space(inode, iblock); if (ret) @@ -2583,7 +2582,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write() and block_write_full_page(). + * callback function for block_write_begin() and block_write_full_page(). * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller @@ -2743,7 +2742,7 @@ static int ext4_writepage(struct page *page, * all are mapped and non delay. We don't want to * do block allocation here. */ - ret = block_prepare_write(page, 0, len, + ret = __block_write_begin(page, 0, len, noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 6b24afb96aae..4f36f8832b9b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -618,7 +618,6 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, struct gfs2_alloc *al = NULL; pgoff_t index = pos >> PAGE_CACHE_SHIFT; unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; struct page *page; gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); @@ -691,7 +690,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } prepare_write: - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, len, gfs2_block_map); out: if (error == 0) return 0; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 0534510200d5..48a274f1674c 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1294,7 +1294,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) int error; if (!page_has_buffers(page)) { - error = block_prepare_write(page, from, to, gfs2_block_map); + error = __block_write_begin(page, from, to - from, gfs2_block_map); if (unlikely(error)) return error; @@ -1313,7 +1313,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) next += bh->b_size; if (buffer_mapped(bh)) { if (end) { - error = block_prepare_write(page, start, end, + error = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(error)) return error; @@ -1328,7 +1328,7 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to) } while (next < to); if (end) { - error = block_prepare_write(page, start, end, gfs2_block_map); + error = __block_write_begin(page, start, end - start, gfs2_block_map); if (unlikely(error)) return error; empty_write_end(page, start, end); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5cfeee118158..f1e962cb3b73 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -165,7 +165,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New - * allows block_prepare_write() to zero. + * allows __block_write_begin() to zero. * * If we see this on a sparse file system, then a truncate has * raced us and removed the cluster. In this case, we clear @@ -407,21 +407,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } -/* - * This is called from ocfs2_write_zero_page() which has handled it's - * own cluster locking and has ensured allocation exists for those - * blocks to be written. - */ -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - int ret; - - ret = block_prepare_write(page, from, to, ocfs2_get_block); - - return ret; -} - /* Taken from ext3. We don't necessarily need the full blown * functionality yet, but IMHO it's better to cut and paste the whole * thing so we can avoid introducing our own bugs (and easily pick up @@ -732,7 +717,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page, } /* - * Some of this taken from block_prepare_write(). We already have our + * Some of this taken from __block_write_begin(). We already have our * mapping by now though, and the entire write will be allocating or * it won't, so not much need to use BH_New. * diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 7606f663da6d..76bfdfda691a 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,9 +22,6 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to); - handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1ca6867935bb..77b4c04a2809 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -796,13 +796,12 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, block_end = block_start + (1 << inode->i_blkbits); /* - * block_start is block-aligned. Bump it by one to - * force ocfs2_{prepare,commit}_write() to zero the + * block_start is block-aligned. Bump it by one to force + * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = ocfs2_prepare_write_nolock(inode, page, - block_start + 1, - block_start + 1); + ret = __block_write_begin(page, block_start + 1, 0, + ocfs2_get_block); if (ret < 0) { mlog_errno(ret); goto out_unlock; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa758377d66..4dcb88046030 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -22,8 +22,6 @@ int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); void reiserfs_evict_inode(struct inode *inode) { @@ -165,7 +163,7 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, ** but tail is still sitting in a direct item, and we can't write to ** it. So, look through this page, and check all the mapped buffers ** to make sure they have valid block numbers. Any that don't need -** to be unmapped, so that block_prepare_write will correctly call +** to be unmapped, so that __block_write_begin will correctly call ** reiserfs_get_block to convert the tail into an unformatted node */ static inline void fix_tail_page_for_writing(struct page *page) @@ -439,13 +437,13 @@ static int reiserfs_bmap(struct inode *inode, sector_t block, } /* special version of get_block that is only used by grab_tail_page right -** now. It is sent to block_prepare_write, and when you try to get a +** now. It is sent to __block_write_begin, and when you try to get a ** block past the end of the file (or a block from a hole) it returns -** -ENOENT instead of a valid buffer. block_prepare_write expects to +** -ENOENT instead of a valid buffer. __block_write_begin expects to ** be able to do i/o on the buffers returned, unless an error value ** is also returned. ** -** So, this allows block_prepare_write to be used for reading a single block +** So, this allows __block_write_begin to be used for reading a single block ** in a page. Where it does not produce a valid page for holes, or past the ** end of the file. This turns out to be exactly what we need for reading ** tails for conversion. @@ -558,11 +556,12 @@ static int convert_tail_for_hole(struct inode *inode, ** ** We must fix the tail page for writing because it might have buffers ** that are mapped, but have a block number of 0. This indicates tail - ** data that has been read directly into the page, and block_prepare_write - ** won't trigger a get_block in this case. + ** data that has been read directly into the page, and + ** __block_write_begin won't trigger a get_block in this case. */ fix_tail_page_for_writing(tail_page); - retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); + retval = __reiserfs_write_begin(tail_page, tail_start, + tail_end - tail_start); if (retval) goto unlock; @@ -2033,7 +2032,7 @@ static int grab_tail_page(struct inode *inode, /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = block_prepare_write(page, start, offset, + error = __block_write_begin(page, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; @@ -2628,8 +2627,7 @@ static int reiserfs_write_begin(struct file *file, return ret; } -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to) +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) { struct inode *inode = page->mapping->host; int ret; @@ -2650,7 +2648,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, th->t_refcount++; } - ret = block_prepare_write(page, from, to, reiserfs_get_block); + ret = __block_write_begin(page, from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* this gets a little ugly. If reiserfs_get_block returned an diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 5cbb81e134ac..adf22b485cea 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -160,8 +160,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); /* ** reiserfs_unpack ** Function try to convert tail from direct item into indirect. @@ -200,7 +198,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we unpack by finding the page with the tail, and calling - ** reiserfs_prepare_write on that page. This will force a + ** __reiserfs_write_begin on that page. This will force a ** reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT; @@ -210,7 +208,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) if (!page) { goto out; } - retval = reiserfs_prepare_write(NULL, page, write_from, write_from); + retval = __reiserfs_write_begin(page, write_from, 0); if (retval) goto out_unlock; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8c4cf273c672..f7415de13878 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -418,8 +418,6 @@ static inline __u32 xattr_hash(const char *msg, int len) int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to); static void update_ctime(struct inode *inode) { @@ -532,8 +530,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, rxh->h_hash = cpu_to_le32(xahash); } - err = reiserfs_prepare_write(NULL, page, page_offset, - page_offset + chunk + skip); + err = __reiserfs_write_begin(page, page_offset, chunk + skip); if (!err) { if (buffer) memcpy(data + skip, buffer + buffer_pos, chunk); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index ab31ce5aeaf9..cf808782c065 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -576,7 +576,7 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). - * __block_prepare_write does this in an [unsigned] long... + * __block_write_begin does this in an [unsigned] long... * page->index << (PAGE_CACHE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index dd1b25b2641c..68d1fe7b877c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -212,7 +212,6 @@ int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); -int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **, get_block_t *, loff_t *); diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 91a4177e60ce..5ca47e59b727 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -2072,6 +2072,8 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + /* namei.c */ void set_de_name_and_namelen(struct reiserfs_dir_entry *de); int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, -- cgit v1.2.3 From fde214d414218fb6cace35708730986bcc94fb53 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Oct 2010 11:37:37 +0200 Subject: isofs: Fix isofs_get_blocks for 8TB files Currently isofs_get_blocks() was limited to handle only 4TB files on 32-bit architectures because of unnecessary use of iblock variable which was signed long. Just remove the variable. The error messages that were using this variable should have rather used b_off anyway because that is the block we are currently mapping. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/isofs/inode.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 09ff41a752a0..60c2b944d762 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -962,25 +962,23 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) * or getblk() if they are not. Returns the number of blocks inserted * (-ve == error.) */ -int isofs_get_blocks(struct inode *inode, sector_t iblock_s, +int isofs_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head **bh, unsigned long nblocks) { - unsigned long b_off; + unsigned long b_off = iblock; unsigned offset, sect_size; unsigned int firstext; unsigned long nextblk, nextoff; - long iblock = (long)iblock_s; int section, rv, error; struct iso_inode_info *ei = ISOFS_I(inode); error = -EIO; rv = 0; - if (iblock < 0 || iblock != iblock_s) { + if (iblock != b_off) { printk(KERN_DEBUG "%s: block number too large\n", __func__); goto abort; } - b_off = iblock; offset = 0; firstext = ei->i_first_extent; @@ -998,8 +996,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, * I/O errors. */ if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { - printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n", - __func__, iblock, (unsigned long) inode->i_size); + printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + __func__, b_off, + (unsigned long long)inode->i_size); goto abort; } @@ -1025,9 +1024,9 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, if (++section > 100) { printk(KERN_DEBUG "%s: More than 100 file sections ?!?" " aborting...\n", __func__); - printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u " + printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " "nextblk=%lu nextoff=%lu\n", __func__, - iblock, firstext, (unsigned) sect_size, + b_off, firstext, (unsigned) sect_size, nextblk, nextoff); goto abort; } -- cgit v1.2.3 From 7e360c38abe2c70eae3ba5a8a17f17671d8b77c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 09:32:55 +0200 Subject: fs: allow for more than 2^31 files Andrew, Could you please review this patch, you probably are the right guy to take it, because it crosses fs and net trees. Note : /proc/sys/fs/file-nr is a read-only file, so this patch doesnt depend on previous patch (sysctl: fix min/max handling in __do_proc_doulongvec_minmax()) Thanks ! [PATCH V4] fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Al Viro --- fs/file_table.c | 17 +++++++---------- include/linux/fs.h | 8 ++++---- kernel/sysctl.c | 6 +++--- net/unix/af_unix.c | 14 +++++++------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index a04bdd81c11c..c3dee381f1b4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -60,7 +60,7 @@ static inline void file_free(struct file *f) /* * Return the total number of open files in the system */ -static int get_nr_files(void) +static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } @@ -68,7 +68,7 @@ static int get_nr_files(void) /* * Return the maximum number of open files in the system */ -int get_max_files(void) +unsigned long get_max_files(void) { return files_stat.max_files; } @@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(ctl_table *table, int write, @@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *get_empty_filp(void) { const struct cred *cred = current_cred(); - static int old_max; + static long old_max; struct file * f; /* @@ -140,8 +140,7 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { - printk(KERN_INFO "VFS: file-max limit %d reached\n", - get_max_files()); + pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } goto fail; @@ -487,7 +486,7 @@ retry: void __init files_init(unsigned long mempages) { - int n; + unsigned long n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages) */ n = (mempages * (PAGE_SIZE / 1024)) / 10; - files_stat.max_files = n; - if (files_stat.max_files < NR_FILE) - files_stat.max_files = NR_FILE; + files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0a5d83633884..0cd6821013a0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,9 +34,9 @@ /* And dynamically-tunable limits and defaults: */ struct files_stat_struct { - int nr_files; /* read only */ - int nr_free_files; /* read only */ - int max_files; /* tunable */ + unsigned long nr_files; /* read only */ + unsigned long nr_free_files; /* read only */ + unsigned long max_files; /* tunable */ }; struct inodes_stat_t { @@ -400,7 +400,7 @@ extern void __init inode_init_early(void); extern void __init files_init(unsigned long); extern struct files_stat_struct files_stat; -extern int get_max_files(void); +extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770f..694b140852c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = { { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..3c95304a0817 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); -- cgit v1.2.3 From 0e45b67d5aeb3dcfb6b149cf61c30b9a8e503f74 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:05:27 +0200 Subject: affs: testing the wrong variable The intent was to verify that bh = affs_bread_ino(...) returned a valid pointer. We checked "ext_bh" earlier in the function and it's valid here. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/affs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/affs/file.c b/fs/affs/file.c index c4a9875bd1a6..0a90dcd46de2 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -894,9 +894,9 @@ affs_truncate(struct inode *inode) if (AFFS_SB(sb)->s_flags & SF_OFS) { struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); u32 tmp; - if (IS_ERR(ext_bh)) { + if (IS_ERR(bh)) { affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", - ext, PTR_ERR(ext_bh)); + ext, PTR_ERR(bh)); return; } tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); -- cgit v1.2.3 From d9d1dc802ffae507956ceb350eb3f4a995734f1e Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Mon, 30 Aug 2010 17:23:12 -0400 Subject: Documentation: Fix trivial typo in filesystems/sharedsubtree.txt Documentation: Fix trivial typo in filesystems/sharedsubtree.txt This typo is easy to ignore unless you have spent a great deal of time thinking about how to eliminate duplicate dentries in unions. Signed-off-by: Valerie Aurora Signed-off-by: Al Viro --- Documentation/filesystems/sharedsubtree.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index fc0e39af43c3..4ede421c9687 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -62,10 +62,10 @@ replicas continue to be exactly same. # mount /dev/sd0 /tmp/a #ls /tmp/a - t1 t2 t2 + t1 t2 t3 #ls /mnt/a - t1 t2 t2 + t1 t2 t3 Note that the mount has propagated to the mount at /mnt as well. -- cgit v1.2.3 From ba10f486658c0ca1bc84c936f6a996e40d071453 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 19 Oct 2010 14:27:59 -0700 Subject: hostfs: fix UML crash: remove f_spare from hostfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 365b1818 ("add f_flags to struct statfs(64)") resized f_spare within struct statfs which caused a UML crash. There is no need to copy f_spare. Signed-off-by: Richard Weinberger Reported-by: Toralf Förster Tested-by: Toralf Förster Cc: Christoph Hellwig Cc: Al Viro Cc: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/hostfs/hostfs.h | 3 +-- fs/hostfs/hostfs_kern.c | 2 +- fs/hostfs/hostfs_user.c | 9 ++------- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 7c232c1487ee..bf15a43016b9 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -91,7 +91,6 @@ extern int rename_file(char *from, char *to); extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out); + void *fsid_out, int fsid_size, long *namelen_out); #endif diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f7dc9b5f9ef8..cd7c93917cc7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -217,7 +217,7 @@ int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) err = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), - &sf->f_namelen, sf->f_spare); + &sf->f_namelen); if (err) return err; sf->f_blocks = f_blocks; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 6777aa06ce2c..8d02683585e0 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -364,8 +364,7 @@ int rename_file(char *from, char *to) int do_statfs(char *root, long *bsize_out, long long *blocks_out, long long *bfree_out, long long *bavail_out, long long *files_out, long long *ffree_out, - void *fsid_out, int fsid_size, long *namelen_out, - long *spare_out) + void *fsid_out, int fsid_size, long *namelen_out) { struct statfs64 buf; int err; @@ -384,10 +383,6 @@ int do_statfs(char *root, long *bsize_out, long long *blocks_out, sizeof(buf.f_fsid) > fsid_size ? fsid_size : sizeof(buf.f_fsid)); *namelen_out = buf.f_namelen; - spare_out[0] = buf.f_spare[0]; - spare_out[1] = buf.f_spare[1]; - spare_out[2] = buf.f_spare[2]; - spare_out[3] = buf.f_spare[3]; - spare_out[4] = buf.f_spare[4]; + return 0; } -- cgit v1.2.3 From 4a3956c790290efeb647bbb0c3a90476bb57800e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 1 Oct 2010 14:20:22 -0700 Subject: vfs: introduce FMODE_UNSIGNED_OFFSET for allowing negative f_pos Now, rw_verify_area() checsk f_pos is negative or not. And if negative, returns -EINVAL. But, some special files as /dev/(k)mem and /proc//mem etc.. has negative offsets. And we can't do any access via read/write to the file(device). So introduce FMODE_UNSIGNED_OFFSET to allow negative file offsets. Signed-off-by: Wu Fengguang Signed-off-by: KAMEZAWA Hiroyuki Cc: Al Viro Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- drivers/char/mem.c | 4 ++++ fs/proc/base.c | 2 ++ fs/read_write.c | 28 ++++++++++++++++++++++++---- include/linux/fs.h | 3 +++ 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index e985b1c2730e..1256454b2d43 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -876,6 +876,10 @@ static int memory_open(struct inode *inode, struct file *filp) if (dev->dev_info) filp->f_mapping->backing_dev_info = dev->dev_info; + /* Is /dev/mem or /dev/kmem ? */ + if (dev->dev_info == &directly_mappable_cdev_bdi) + filp->f_mode |= FMODE_UNSIGNED_OFFSET; + if (dev->fops->open) return dev->fops->open(inode, filp); diff --git a/fs/proc/base.c b/fs/proc/base.c index dc5d5f51f3fe..fb2a5abd4e4f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -771,6 +771,8 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { file->private_data = (void*)((long)current->self_exec_id); + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index e757ef26e4ce..9cd9d148105d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,6 +31,20 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); +static int +__negative_fpos_check(struct file *file, loff_t pos, size_t count) +{ + /* + * pos or pos+count is negative here, check overflow. + * too big "count" will be caught in rw_verify_area(). + */ + if ((pos < 0) && (pos + count < pos)) + return -EOVERFLOW; + if (file->f_mode & FMODE_UNSIGNED_OFFSET) + return 0; + return -EINVAL; +} + /** * generic_file_llseek_unlocked - lockless generic llseek implementation * @file: file structure to seek on @@ -62,7 +76,9 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) break; } - if (offset < 0 || offset > inode->i_sb->s_maxbytes) + if (offset < 0 && __negative_fpos_check(file, offset, 0)) + return -EINVAL; + if (offset > inode->i_sb->s_maxbytes) return -EINVAL; /* Special lock needed here? */ @@ -137,7 +153,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += file->f_pos; } retval = -EINVAL; - if (offset >= 0) { + if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -221,6 +237,7 @@ bad: } #endif + /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others @@ -238,8 +255,11 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count if (unlikely((ssize_t) count < 0)) return retval; pos = *ppos; - if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) - return retval; + if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) { + retval = __negative_fpos_check(file, pos, count); + if (retval) + return retval; + } if (unlikely(inode->i_flock && mandatory_lock(inode))) { retval = locks_mandatory_area( diff --git a/include/linux/fs.h b/include/linux/fs.h index 0cd6821013a0..7fc126df1c42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -92,6 +92,9 @@ struct inodes_stat_t { /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)0x1000) +/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) -- cgit v1.2.3 From 3072b90c4726396e38160cc1f7c93191e2f4a566 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:49:17 +0200 Subject: hfs: use sync_dirty_buffer Use sync_dirty_buffer instead of the incorrect opencoding it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/hfs/hfs_fs.h | 11 ----------- fs/hfs/mdb.c | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 4f55651aaa51..1efcbc765d3c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -254,17 +254,6 @@ static inline void hfs_bitmap_dirty(struct super_block *sb) sb->s_dirt = 1; } -static inline void hfs_buffer_sync(struct buffer_head *bh) -{ - while (buffer_locked(bh)) { - wait_on_buffer(bh); - } - if (buffer_dirty(bh)) { - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - } -} - #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 86428f5ac991..1563d5ce5764 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -220,7 +220,7 @@ int hfs_mdb_get(struct super_block *sb) mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); - hfs_buffer_sync(HFS_SB(sb)->mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; @@ -287,7 +287,7 @@ void hfs_mdb_commit(struct super_block *sb) HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); - hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh); + sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { -- cgit v1.2.3 From bb1e5f8c022ae2f6bf1ec0c6b861db3c35976377 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 15 Oct 2010 16:32:27 -0700 Subject: fs: move exportfs since it is not a networking filesystem Move the EXPORTFS kconfig symbol out of the NETWORK_FILESYSTEMS block since it provides a library function that can be (and is) used by other (non-network) filesystems. This also eliminates a kconfig dependency warning: warning: (XFS_FS && BLOCK || NFSD && NETWORK_FILESYSTEMS && INET && FILE_LOCKING && BKL) selects EXPORTFS which has unmet direct dependencies (NETWORK_FILESYSTEMS) Signed-off-by: Randy Dunlap Cc: Dave Chinner Cc: Christoph Hellwig Cc: Alex Elder Cc: xfs-masters@oss.sgi.com Signed-off-by: Al Viro --- fs/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 65781de44fc0..b5e582bd769d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -47,6 +47,9 @@ source "fs/nilfs2/Kconfig" endif # BLOCK +config EXPORTFS + tristate + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -222,9 +225,6 @@ config LOCKD_V4 depends on FILE_LOCKING default y -config EXPORTFS - tristate - config NFS_ACL_SUPPORT tristate select FS_POSIX_ACL -- cgit v1.2.3 From 8358e7d71e712d3bd4e20ecf23e6fd7480c83684 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 16 Oct 2010 12:40:33 +0900 Subject: fs/buffer.c: remove duplicated assignment on b_private bh->b_private is initialized within init_buffer(), thus the assignment should be redundant. Remove it. Signed-off-by: Namhyung Kim Signed-off-by: Al Viro --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index a7b8f3c59a4e..74566c6f67b1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -905,7 +905,6 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); - bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ -- cgit v1.2.3 From e1455d1bdccbe056ba53479741b1452106ce59aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 10:46:53 +0200 Subject: update block_device_operations documentation Updated Documentation/filesystems/Locking to match the code. Signed-off-by: Christoph Hellwig --- Documentation/filesystems/Locking | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2db4283efa8d..8a817f656f0a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -349,21 +349,36 @@ call this method upon the IO completion. --------------------------- block_device_operations ----------------------- prototypes: - int (*open) (struct inode *, struct file *); - int (*release) (struct inode *, struct file *); - int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*open) (struct block_device *, fmode_t); + int (*release) (struct gendisk *, fmode_t); + int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); int (*media_changed) (struct gendisk *); + void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); + int (*getgeo)(struct block_device *, struct hd_geometry *); + void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: - BKL bd_sem -open: yes yes -release: yes yes -ioctl: yes no + BKL bd_mutex +open: no yes +release: no yes +ioctl: no no +compat_ioctl: no no +direct_access: no no media_changed: no no +unlock_native_capacity: no no revalidate_disk: no no +getgeo: no no +swap_slot_free_notify: no no (see below) + +media_changed, unlock_native_capacity and revalidate_disk are called only from +check_disk_change(). + +swap_slot_free_notify is called with swap_lock and sometimes the page lock +held. -The last two are called only from check_disk_change(). --------------------------- file_operations ------------------------------- prototypes: -- cgit v1.2.3 From 306fb0979443419288594446a348155a8027dcf2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 23 Aug 2010 10:47:55 -0400 Subject: aio: bump i_count instead of using igrab The aio batching code is using igrab to get an extra reference on the inode so it can safely batch. igrab will go ahead and take the global inode spinlock, which can be a bottleneck on large machines doing lots of AIO. In this case, igrab isn't required because we already have a reference on the file handle. It is safe to just bump the i_count directly on the inode. Benchmarking shows this patch brings IOP/s on tons of flash up by about 2.5X. Signed-off-by: Chris Mason --- fs/aio.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 250b0a73c8a8..9e319a04780e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1543,7 +1543,20 @@ static void aio_batch_add(struct address_space *mapping, } abe = mempool_alloc(abe_pool, GFP_KERNEL); - BUG_ON(!igrab(mapping->host)); + + /* + * we should be using igrab here, but + * we don't want to hammer on the global + * inode spinlock just to take an extra + * reference on a file that we must already + * have a reference to. + * + * When we're called, we always have a reference + * on the file, so we must always have a reference + * on the inode, so igrab must always just + * bump the count and move on. + */ + atomic_inc(&mapping->host->i_count); abe->mapping = mapping; hlist_add_head(&abe->list, &batch_hash[bucket]); return; -- cgit v1.2.3 From a3314a0ed389f51a51695120b429eccd45b3a165 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 11 Oct 2010 22:38:00 +0900 Subject: lockdep: fixup checking of dir inode annotation Since inode->i_mode shares its bits for S_IFMT, S_ISDIR should be used to distinguish whether it is a dir or not. Signed-off-by: Namhyung Kim Signed-off-by: Al Viro --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 3368abd64bb5..4f0a67c54f89 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -663,7 +663,7 @@ EXPORT_SYMBOL(new_inode); void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ -- cgit v1.2.3 From 309f77ad9bea057d55b04580b5a711e9e3727e83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 25 Oct 2010 15:01:12 +0900 Subject: fs/buffer.c: call __block_write_begin() if we have page If we have the appropriate page already, call __block_write_begin() directly instead of releasing and regrabbing it inside of block_write_begin(). Signed-off-by: Namhyung Kim Signed-off-by: Al Viro --- fs/buffer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 74566c6f67b1..d895d9fd5b71 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2458,11 +2458,10 @@ int nobh_write_begin(struct address_space *mapping, *fsdata = NULL; if (page_has_buffers(page)) { - unlock_page(page); - page_cache_release(page); - *pagep = NULL; - return block_write_begin(mapping, pos, len, flags, pagep, - get_block); + ret = __block_write_begin(page, pos, len, get_block); + if (unlikely(ret)) + goto out_release; + return ret; } if (PageMappedToDisk(page)) -- cgit v1.2.3 From 8e3b9a072d071700e83e88b0bf59115c59042885 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 09:45:22 -0400 Subject: ext2_remount: don't bother with invalidate_inodes() It's pointless - we *do* have busy inodes (root directory, for one), so that call will fail and attempt to change XIP flag will be ignored. Signed-off-by: Al Viro --- fs/ext2/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 85df87d0f7b7..0901320671da 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1221,9 +1221,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) } es = sbi->s_es; - if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != - (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) { + if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " "xip flag with busy inodes while remounting"); sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; -- cgit v1.2.3 From 9dcefee508d547eed88f3c578dc92819bdeaa952 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 09:46:50 -0400 Subject: gfs2: invalidate_inodes() is no-op there In fill_super() we hadn't MS_ACTIVE set yet, so there won't be any inodes with zero i_count sitting around. In put_super() we already have MS_ACTIVE removed *and* we had called invalidate_inodes() since then. So again there won't be any inodes with zero i_count... Signed-off-by: Al Viro --- fs/gfs2/ops_fstype.c | 1 - fs/gfs2/super.c | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index aeafc233dc89..cade1acbcea9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1219,7 +1219,6 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - invalidate_inodes(sb); gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); fail_sys: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 047d1176096c..2b2c4997430b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -857,7 +857,6 @@ restart: gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - invalidate_inodes(sdp->sd_vfs); gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); -- cgit v1.2.3 From 70fd136ecc44ca8c364f4412f447de1d940c3639 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 09:50:11 -0400 Subject: ntfs: don't call invalidate_inodes() We are in fill_super(); again, no inodes with zero i_count could be around until we set MS_ACTIVE. Signed-off-by: Al Viro --- fs/ntfs/super.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 19c5180f8a28..d4dec2d32117 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3021,21 +3021,6 @@ iput_tmp_ino_err_out_now: if (vol->mft_ino && vol->mft_ino != tmp_ino) iput(vol->mft_ino); vol->mft_ino = NULL; - /* - * This is needed to get ntfs_clear_extent_inode() called for each - * inode we have ever called ntfs_iget()/iput() on, otherwise we A) - * leak resources and B) a subsequent mount fails automatically due to - * ntfs_iget() never calling down into our ntfs_read_locked_inode() - * method again... FIXME: Do we need to do this twice now because of - * attribute inodes? I think not, so leave as is for now... (AIA) - */ - if (invalidate_inodes(sb)) { - ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " - "driver bug."); - /* Copied from fs/super.c. I just love this message. (-; */ - printk("NTFS: Busy inodes after umount. Self-destruct in 5 " - "seconds. Have a nice day...\n"); - } /* Errors at this stage are irrelevant. */ err_out_now: sb->s_fs_info = NULL; -- cgit v1.2.3 From 61ebdb4254e3ecb59022d2c730b57b04d0eeecc6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 10:48:15 -0400 Subject: smbfs never retains inodes with zero refcount in the first place Signed-off-by: Al Viro --- fs/smbfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 8fc5e50e142f..f6e9ee59757e 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -229,7 +229,6 @@ smb_invalidate_inodes(struct smb_sb_info *server) { VERBOSE("\n"); shrink_dcache_sb(SB_of(server)); - invalidate_inodes(SB_of(server)); } /* -- cgit v1.2.3 From a8dade34e3df581bc36ca2afe6e27055e178801c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Oct 2010 11:13:10 -0400 Subject: unexport invalidate_inodes Signed-off-by: Al Viro --- fs/inode.c | 1 - fs/internal.h | 5 +++++ include/linux/fs.h | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 4f0a67c54f89..db7c74c7dd80 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -417,7 +417,6 @@ int invalidate_inodes(struct super_block *sb) return busy; } -EXPORT_SYMBOL(invalidate_inodes); static int can_unuse(struct inode *inode) { diff --git a/fs/internal.h b/fs/internal.h index a6910e91cee8..f6dce46d80dc 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -101,3 +101,8 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); + +/* + * inode.c + */ +extern int invalidate_inodes(struct super_block *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7fc126df1c42..c3f6daf749cc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2082,7 +2082,6 @@ extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif -extern int invalidate_inodes(struct super_block *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -- cgit v1.2.3 From 1d3382cbf02986e4833849f528d451367ea0b4cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Oct 2010 15:19:20 -0400 Subject: new helper: inode_unhashed() note: for race-free uses you inode_lock held Signed-off-by: Al Viro