From 04bedd79a7037ee7af816b06c60c738144475c4a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 18 Sep 2009 14:31:47 -0500 Subject: dlm: fix lowcomms_connect_node for sctp The recently added dlm_lowcomms_connect_node() from 391fbdc5d527149578490db2f1619951d91f3561 does not work when using SCTP instead of TCP. The sctp connection code has nothing to do without data to send. Check for no data in the sctp connection code and do nothing instead of triggering a BUG. Also have connect_node() do nothing when the protocol is sctp. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 240cef14fe58..a3350e4c8184 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + if (nodeid == dlm_our_nodeid()) return 0; @@ -855,11 +859,14 @@ static void sctp_init_assoc(struct connection *con) outmessage.msg_flags = MSG_EOR; spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - BUG_ON((struct list_head *) e == &con->writequeue); + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); -- cgit v1.2.3 From 6861f350785bf476c2d4e3b9cb69ee36b78df2fc Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 24 Sep 2009 15:58:23 -0500 Subject: dlm: fix socket fd translation The code to set up sctp sockets was not using the sockfd_lookup() and sockfd_put() routines to translate an fd to a socket. The direct fget and fput calls were resulting in error messages from alloc_fd(). Also clean up two log messages and remove a third, related to setting up sctp associations. Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a3350e4c8184..70736eb4b516 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -459,9 +459,9 @@ static void process_sctp_notification(struct connection *con, int prim_len, ret; int addr_len; struct connection *new_con; - struct file *file; sctp_peeloff_arg_t parg; int parglen = sizeof(parg); + int err; /* * We get this before any data for an association. @@ -516,19 +516,22 @@ static void process_sctp_notification(struct connection *con, ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, SCTP_SOCKOPT_PEELOFF, (void *)&parg, &parglen); - if (ret) { + if (ret < 0) { log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d\n", + "connection %d to node %d: err=%d", parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; } - file = fget(parg.sd); - new_con->sock = SOCKET_I(file->f_dentry->d_inode); add_sock(new_con->sock, new_con); - fput(file); - put_unused_fd(parg.sd); + sockfd_put(new_con->sock); - log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); @@ -841,8 +844,6 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - log_print("Initiating association with node %d", con->nodeid); - if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { log_print("no address for nodeid %d", con->nodeid); return; -- cgit v1.2.3 From 61d92c328c16419fc96dc50dd16f8b8c695409ec Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 2 Oct 2009 19:11:56 -0400 Subject: Btrfs: fix deadlock on async thread startup The btrfs async worker threads are used for a wide variety of things, including processing bio end_io functions. This means that when the endio threads aren't running, the rest of the FS isn't able to do the final processing required to clear PageWriteback. The endio threads also try to exit as they become idle and start more as the work piles up. The problem is that starting more threads means kthreadd may need to allocate ram, and that allocation may wait until the global number of writeback pages on the system is below a certain limit. The result of that throttling is that end IO threads wait on kthreadd, who is waiting on IO to end, which will never happen. This commit fixes the deadlock by handing off thread startup to a dedicated thread. It also fixes a bug where the on-demand thread creation was creating far too many threads because it didn't take into account threads being started by other procs. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 81 ++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/async-thread.h | 10 ++++-- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 42 +++++++++++++------------ fs/btrfs/relocation.c | 4 +-- 5 files changed, 106 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 282ca085c2fb..c0861e781cdb 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -63,6 +63,51 @@ struct btrfs_worker_thread { int idle; }; +/* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + /* * helper function to move a thread onto the idle list after it * has finished some requests. @@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) goto out; workers->atomic_start_pending = 0; - if (workers->num_workers >= workers->max_workers) + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) goto out; + workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - btrfs_start_workers(workers, 1); + start_new_worker(workers); return; out: @@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers) /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); @@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) workers->name = name; workers->ordered = 0; workers->atomic_start_pending = 0; - workers->atomic_worker_start = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -452,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -509,15 +572,17 @@ again: worker = next_worker(workers); if (!worker) { - if (workers->num_workers >= workers->max_workers) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { goto fallback; } else if (workers->atomic_worker_start) { workers->atomic_start_pending = 1; goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc089b95ec14..5077746cf85e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -78,9 +80,10 @@ struct btrfs_workers { /* * are we allowed to sleep while starting workers or are we required - * to start them at a later time? + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. */ - int atomic_worker_start; + struct btrfs_workers *atomic_worker_start; /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the @@ -109,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8184f2feb2f3..1b920ffc6a59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -907,6 +907,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 69dce50aabd2..9903f042765d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -EINVAL; goto fail_iput; } -printk("thread pool is %d\n", fs_info->thread_pool_size); - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->endio_workers.atomic_worker_start = 1; - fs_info->endio_meta_workers.atomic_worker_start = 1; - fs_info->endio_write_workers.atomic_worker_start = 1; - fs_info->endio_meta_write_workers.atomic_worker_start = 1; - btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); @@ -2012,6 +2014,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 361ad323faac..cfcc93c93a7b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) BUG_ON(!rc->block_group); btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); + fs_info->thread_pool_size, NULL); rc->extent_root = extent_root; btrfs_prepare_block_group_relocation(extent_root, rc->block_group); @@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) mapping_tree_init(&rc->reloc_root_tree); INIT_LIST_HEAD(&rc->reloc_roots); btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); + root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); -- cgit v1.2.3 From 1cdda9b81ac0e6ee986f034fa02f221679e1c11a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 6 Oct 2009 10:04:28 -0400 Subject: Btrfs: fix possible softlockup in the allocator Like the cluster allocating stuff, we can lockup the box with the normal allocation path. This happens when we 1) Start to cache a block group that is severely fragmented, but has a decent amount of free space. 2) Start to commit a transaction 3) Have the commit try and empty out some of the delalloc inodes with extents that are relatively large. The inodes will not be able to make the allocations because they will ask for allocations larger than a contiguous area in the free space cache. So we will wait for more progress to be made on the block group, but since we're in a commit the caching kthread won't make any more progress and it already has enough free space that wait_block_group_cache_progress will just return. So, if we wait and fail to make the allocation the next time around, just loop and go to the next block group. This keeps us from getting stuck in a softlockup. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d119c0388af1..2f82fabd7011 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4028,6 +4028,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; + bool failed_alloc = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4232,14 +4233,23 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -4287,6 +4297,7 @@ checks: break; loop: failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.3 From 316d315bffa4026f28085f6b24ebcebede370ac7 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 6 Oct 2009 20:16:55 +0200 Subject: block: Seperate read and write statistics of in_flight requests v2 Commit a9327cac440be4d8333bba975cbbf76045096275 added seperate read and write statistics of in_flight requests. And exported the number of read and write requests in progress seperately through sysfs. But Corrado Zoccolo reported getting strange output from "iostat -kx 2". Global values for service time and utilization were garbage. For interval values, utilization was always 100%, and service time is higher than normal. So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b The problem was in part_round_stats_single(), I missed the following: if (now == part->stamp) return; - if (part->in_flight) { + if (part_in_flight(part)) { __part_stat_add(cpu, part, time_in_queue, part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); With this chunk included, the reported regression gets fixed. Signed-off-by: Nikanth Karthikesan -- Signed-off-by: Jens Axboe --- fs/partitions/check.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f38fee0311a7..7b685e10cbad 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif -- cgit v1.2.3 From 8347a5cdd1422eea0470ed586274c7f29e274b47 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 6 Oct 2009 18:31:29 +0000 Subject: [CIFS] Fixing to avoid invalid kfree() in cifs_get_tcp_session() trivial bug in fs/cifs/connect.c . The bug is caused by fail of extract_hostname() when mounting cifs file system. This is the situation when I noticed this bug. % sudo mount -t cifs //192.168.10.208 mountpoint -o options... Then my kernel says, [ 1461.807776] ------------[ cut here ]------------ [ 1461.807781] kernel BUG at mm/slab.c:521! [ 1461.807784] invalid opcode: 0000 [#2] PREEMPT SMP [ 1461.807790] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:09:02.0/resource [ 1461.807793] CPU 0 [ 1461.807796] Modules linked in: nls_iso8859_1 usbhid sbp2 uhci_hcd ehci_hcd i2c_i801 ohci1394 ieee1394 psmouse serio_raw pcspkr sky2 usbcore evdev [ 1461.807816] Pid: 3446, comm: mount Tainted: G D 2.6.32-rc2-vanilla [ 1461.807820] RIP: 0010:[] [] kfree+0x63/0x156 [ 1461.807829] RSP: 0018:ffff8800b4f7fbb8 EFLAGS: 00010046 [ 1461.807832] RAX: ffffea00033fff98 RBX: ffff8800afbae7e2 RCX: 0000000000000000 [ 1461.807836] RDX: ffffea0000000000 RSI: 000000000000005c RDI: ffffffffffffffea [ 1461.807839] RBP: ffff8800b4f7fbf8 R08: 0000000000000001 R09: 0000000000000000 [ 1461.807842] R10: 0000000000000000 R11: ffff8800b4f7fbf8 R12: 00000000ffffffea [ 1461.807845] R13: ffff8800afb23000 R14: ffff8800b4f87bc0 R15: ffffffffffffffea [ 1461.807849] FS: 00007f52b6f187c0(0000) GS:ffff880007600000(0000) knlGS:0000000000000000 [ 1461.807852] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1461.807855] CR2: 0000000000613000 CR3: 00000000af8f9000 CR4: 00000000000006f0 [ 1461.807858] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1461.807861] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1461.807865] Process mount (pid: 3446, threadinfo ffff8800b4f7e000, task ffff8800950e4380) [ 1461.807867] Stack: [ 1461.807869] 0000000000000202 0000000000000282 ffff8800b4f7fbf8 ffff8800afbae7e2 [ 1461.807876] <0> 00000000ffffffea ffff8800afb23000 ffff8800b4f87bc0 ffff8800b4f7fc28 [ 1461.807884] <0> ffff8800b4f7fcd8 ffffffff81159f6d ffffffff81147bc2 ffffffff816bfb48 [ 1461.807892] Call Trace: [ 1461.807899] [] cifs_get_tcp_session+0x440/0x44b [ 1461.807904] [] ? find_nls+0x1c/0xe9 [ 1461.807909] [] cifs_mount+0x16bc/0x2167 [ 1461.807917] [] ? _spin_unlock+0x30/0x4b [ 1461.807923] [] cifs_get_sb+0xa5/0x1a8 [ 1461.807928] [] vfs_kern_mount+0x56/0xc9 [ 1461.807933] [] do_kern_mount+0x47/0xe7 [ 1461.807938] [] do_mount+0x712/0x775 [ 1461.807943] [] ? copy_mount_options+0xcf/0x132 [ 1461.807948] [] sys_mount+0x7f/0xbf [ 1461.807953] [] ? lockdep_sys_exit_thunk+0x35/0x67 [ 1461.807960] [] system_call_fastpath+0x16/0x1b [ 1461.807963] Code: 00 00 00 00 ea ff ff 48 c1 e8 0c 48 6b c0 68 48 01 d0 66 83 38 00 79 04 48 8b 40 10 66 83 38 00 79 04 48 8b 40 10 80 38 00 78 04 <0f> 0b eb fe 4c 8b 70 58 4c 89 ff 41 8b 76 4c e8 b8 49 fb ff e8 [ 1461.808022] RIP [] kfree+0x63/0x156 [ 1461.808027] RSP [ 1461.808031] ---[ end trace ffe26fcdc72c0ce4 ]--- The reason of this bug is that the error handling code of cifs_get_tcp_session() calls kfree() when corresponding kmalloc() failed. (The kmalloc() is called by extract_hostname().) Signed-off-by: Hitoshi Mitake CC: Stable Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 43003e0bef18..b09098079916 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) out_err: if (tcp_ses) { - kfree(tcp_ses->hostname); + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); -- cgit v1.2.3 From c5811dbdd26284d63c19fca618bd740dd10ad53d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:40:15 -0400 Subject: NFS: Fix a default mount regression... With the recent spate of changes, the nfs protocol version will now default to 2 instead of 3, while the mount protocol version defaults to 3. The following patch should ensure the defaults are consistent with the previous defaults of vers=3,proto=tcp,mountvers=3,mountproto=tcp. This fixes the bug http://bugzilla.kernel.org/show_bug.cgi?id=14259 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29786d3b9326..0343ebcfbbdc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb) unlock_kernel(); } -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) { struct nfs_parsed_mount_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->flags = flags; data->rsize = NFS_MAX_FILE_IO_SIZE; data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_UNIX; data->auth_flavor_len = 1; + data->version = version; data->minorversion = 0; } return data; @@ -1711,8 +1713,6 @@ static int nfs_validate_mount_data(void *options, if (!(data->flags & NFS_MOUNT_TCP)) args->nfs_server.protocol = XPRT_TRANSPORT_UDP; - else - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; @@ -2106,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); + data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2376,7 +2376,6 @@ static int nfs4_validate_mount_data(void *options, if (data == NULL) goto out_no_data; - args->version = 4; switch (data->version) { case 1: if (data->host_addrlen > sizeof(args->nfs_server.address)) @@ -2660,7 +2659,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs_parsed_mount_data *data; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(0); + data = nfs_alloc_parsed_mount_data(4); if (data == NULL) goto out_free_data; -- cgit v1.2.3 From f5855fecda65c1965c894915ace1e086d4925154 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:40:37 -0400 Subject: NFS: Fix port and mountport display in /proc/self/mountinfo Currently, the port and mount port will both display as 65535 if you do not specify a port number. That would be wrong... Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0343ebcfbbdc..0d14704c539e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -778,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ -static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, +static void nfs_set_port(struct sockaddr *sap, int *port, const unsigned short default_port) { - unsigned short port = default_port; + if (*port == NFS_UNSPEC_PORT) + *port = default_port; - if (parsed_port != NFS_UNSPEC_PORT) - port = parsed_port; - - rpc_set_port(sap, port); + rpc_set_port(sap, *port); } /* @@ -1477,7 +1475,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - nfs_set_default_port(request.sap, args->mount_server.port, 0); + nfs_set_port(request.sap, &args->mount_server.port, 0); /* * Now ask the mount server to map our export path @@ -1767,7 +1765,7 @@ static int nfs_validate_mount_data(void *options, goto out_v4_not_compiled; #endif - nfs_set_default_port(sap, args->nfs_server.port, 0); + nfs_set_port(sap, &args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -2331,7 +2329,7 @@ static int nfs4_validate_text_mount_data(void *options, { struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); nfs_validate_transport_protocol(args); -- cgit v1.2.3 From bcd2ea17da6a329a7276cde7286d802f009af332 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:41:22 -0400 Subject: NFS: Fix port initialisation in nfs_remount() The recent changeset 53a0b9c4c99ab0085a06421f71592722e5b3fd5f (NFS: Replace nfs_parse_ip_address() with rpc_pton()) broke nfs_remount, since the call to rpc_pton() will zero out the port number in data->nfs_server.address. This is actually due to a bug in nfs_remount: it should be looking at the port number in nfs_server.port instead... This fixes bug http://bugzilla.kernel.org/show_bug.cgi?id=14276 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d14704c539e..fb3b280cacfe 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1846,9 +1846,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen) != 0) + !rpc_cmp_addr(&data->nfs_server.address, + &nfss->nfs_client->cl_addr)) return -EINVAL; return 0; @@ -1891,6 +1892,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->acdirmin = nfss->acdirmin / HZ; data->acdirmax = nfss->acdirmax / HZ; data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); -- cgit v1.2.3 From f4373bf9e67e4a653c8854acd7b02dac9714c98a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 6 Oct 2009 15:42:18 -0400 Subject: nfs: Avoid overrun when copying client IP address string As seen in , nfs4_init_client() can overrun the source string when copying the client IP address from nfs_parsed_mount_data::client_address to nfs_client::cl_ipaddr. Since these are both treated as null-terminated strings elsewhere, the copy should be done with strlcpy() not memcpy(). Signed-off-by: Ben Hutchings Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 63976c0ccc25..99ea196f071f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp, 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { -- cgit v1.2.3 From 517be09def6cd7bc231222ee756fde8ea245a6fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 6 Oct 2009 15:42:20 -0400 Subject: NFSv4: Fix the referral mount code Fix a typo which causes try_location() to use the wrong length argument when calling nfs_parse_server_name(). This again, causes the initialisation of the mount's sockaddr structure to fail. Also ensure that if nfs4_pathname_string() returns an error, then we pass that error back up the stack instead of ENOENT. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2636c26d56fa..fa3408f20112 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) - return mnt; + return ERR_CAST(mnt_path); mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; @@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (buf->len <= 0 || buf->len >= maxbuflen) continue; - mountdata->addr = (struct sockaddr *)&addr; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - mountdata->addrlen = nfs_parse_server_name(buf->data, - buf->len, - mountdata->addr, mountdata->addrlen); + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + (struct sockaddr *)&addr, sizeof(addr)); if (mountdata->addrlen == 0) continue; + + mountdata->addr = (struct sockaddr *)&addr; rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); -- cgit v1.2.3 From 4055e97318809638a57fbe1746b93bc7a90ef0d3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 7 Oct 2009 16:32:24 -0700 Subject: fs: includecheck fix: proc, kcore.c fix the following 'make includecheck' warning: fs/proc/kcore.c: linux/mm.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 56013371f9f3..a44a7897fd4d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 253fb02d62571e5455eedc9e39b9d660e86a40f0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Oct 2009 16:32:27 -0700 Subject: pagemap: export KPF_HWPOISON This flag indicates a hardware detected memory corruption on the page. Any future access of the page data may bring down the machine. Signed-off-by: Wu Fengguang Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 2281c2cbfe2b..5033ce0d254b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 #define KPF_KSM 21 @@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif -- cgit v1.2.3 From 3050141bae57984dd660e6861632ccf9b8bca77e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 8 Oct 2009 11:50:55 -0400 Subject: NFSv4: Kill nfs4_renewd_prepare_shutdown() The NFSv4 renew daemon is shared between all active super blocks that refer to a particular NFS server, so it is wrong to be shutting it down in nfs4_kill_super every time a super block is destroyed. This patch therefore kills nfs4_renewd_prepare_shutdown altogether, and leaves it up to nfs4_shutdown_client() to also shut down the renew daemon by means of the existing call to nfs4_kill_renewd(). Signed-off-by: Trond Myklebust --- fs/nfs/nfs4renewd.c | 6 ------ fs/nfs/super.c | 1 - 2 files changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e27c6cef18f2..0156c01c212c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -126,12 +126,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) spin_unlock(&clp->cl_lock); } -void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - void nfs4_kill_renewd(struct nfs_client *clp) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fb3b280cacfe..6dabf6feec94 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2689,7 +2689,6 @@ static void nfs4_kill_super(struct super_block *sb) dprintk("--> %s\n", __func__); nfs_super_return_all_delegations(sb); kill_anon_super(sb); - nfs4_renewd_prepare_shutdown(server); nfs_fscache_release_super_cookie(sb); nfs_free_server(server); dprintk("<-- %s\n", __func__); -- cgit v1.2.3 From 664fc5a4e7d0d7f3487e5c856b79f7dac79567fd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Sep 2009 13:34:20 -0700 Subject: ecryptfs: depends on CRYPTO ecryptfs uses crypto APIs so it should depend on CRYPTO. Otherwise many build errors occur. [63 lines not pasted] Signed-off-by: Randy Dunlap Cc: Andrew Morton Cc: ecryptfs-devel@lists.launchpad.net Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 8aadb99b7634..89ebed3a4869 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET + depends on EXPERIMENTAL && KEYS && NET && CRYPTO select CRYPTO_ECB select CRYPTO_CBC help -- cgit v1.2.3 From ed1f21857e76a92a006e0f890a3d7f72953b1469 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 29 Sep 2009 02:33:59 -0500 Subject: eCryptfs: Remove Kconfig NET dependency and select MD5 eCryptfs no longer uses a netlink interface to communicate with ecryptfsd, so NET is not a valid dependency anymore. MD5 is required and must be built for eCryptfs to be of any use. Signed-off-by: Tyler Hicks --- fs/ecryptfs/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 89ebed3a4869..1cd6d9d3e29a 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,8 +1,9 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET && CRYPTO + depends on EXPERIMENTAL && KEYS && CRYPTO select CRYPTO_ECB select CRYPTO_CBC + select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See to learn more about -- cgit v1.2.3 From 36520be8e32b49bd85a63b7b8b40cd07c3da59a5 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 5 Oct 2009 14:25:44 -0400 Subject: ima: ecryptfs fix imbalance message The unencrypted files are being measured. Update the counters to get rid of the ecryptfs imbalance message. (http://bugzilla.redhat.com/519737) Reported-by: Sachin Garg Cc: Eric Paris Cc: Dustin Kirkland Cc: James Morris Cc: David Safford Cc: stable@kernel.org Signed-off-by: Mimi Zohar Signed-off-by: Tyler Hicks --- fs/ecryptfs/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c7b1ee..c6ac85d6c701 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "ecryptfs_kernel.h" /** @@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } + } else + opened_lower_file = 1; } mutex_unlock(&inode_info->lower_file_mutex); + if (opened_lower_file) + ima_counts_get(inode_info->lower_file); return rc; } -- cgit v1.2.3 From f9581b1443abac50c90168301d40a7734b13a5dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:26 +0000 Subject: xfs: implement ->dirty_inode to fix timestamp handling This is picking up on Felix's repost of Dave's patch to implement a .dirty_inode method. We really need this notification because the VFS keeps writing directly into the inode structure instead of going through methods to update this state. In addition to the long-known atime issue we now also have a caller in VM code that updates c/mtime that way for shared writeable mmaps. And I found another one that no one has noticed in practice in the FIFO code. So implement ->dirty_inode to set i_update_core whenever the inode gets externally dirtied, and switch the c/mtime handling to the same scheme we already use for atime (always picking up the value from the Linux inode). Note that this patch also removes the xfs_synchronize_atime call in xfs_reclaim it was superflous as we already synchronize the time when writing the inode via the log (xfs_inode_item_format) or the normal buffers (xfs_iflush_int). In addition also remove the I_CLEAR check before copying the Linux timestamps - now that we always have the Linux inode available we can always use the timestamps in it. Also switch to just using file_update_time for regular reads/writes - that will get us all optimization done to it for free and make sure we notice early when it breaks. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 1 - fs/xfs/linux-2.6/xfs_iops.c | 41 +++++++++++++++-------------------------- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- fs/xfs/linux-2.6/xfs_super.c | 23 +++++++++++++++++++++++ fs/xfs/xfs_dfrag.c | 8 ++++---- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 18 +++++++++++------- fs/xfs/xfs_itable.c | 21 +++++++++++++-------- fs/xfs/xfs_vnodeops.c | 6 ------ 10 files changed, 70 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index d5e5559e31db..40d2226060d1 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -215,7 +215,6 @@ xfs_setfilesize( if (ip->i_d.di_size < isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; xfs_mark_inode_dirty_sync(ip); } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 626b474b2cdd..cdf7114d41fc 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,19 +57,22 @@ #include /* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. */ void -xfs_synchronize_atime( +xfs_synchronize_times( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & I_CLEAR)) { - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; } /* @@ -106,32 +109,20 @@ xfs_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. + * Update complete - now make sure everyone knows that the inode + * is dirty. */ - if (sync_it) { - SYNCHRONIZE(); - ip->i_update_core = 1; + if (sync_it) xfs_mark_inode_dirty_sync(ip); - } } /* @@ -514,10 +505,8 @@ xfs_vn_getattr( stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; - stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; - stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 49e4a6aea73c..072050f8d346 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -667,7 +667,7 @@ start: xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 5d7c60ac77b4..1ea65b6e5ab6 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -976,6 +976,28 @@ xfs_fs_inode_init_once( mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); } +/* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. Care must be taken + * here - the transaction code calls mark_inode_dirty_sync() to mark the + * VFS inode dirty in a transaction and clears the i_update_core field; + * it must clear the field after calling mark_inode_dirty_sync() to + * correctly indicate that the dirty state has been propagated into the + * inode log item. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + /* * Attempt to flush the inode, this will actually fail * if the inode is pinned, but we dirty the inode again @@ -1539,6 +1561,7 @@ xfs_fs_get_sb( static struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7465f9ee125f..ab89a7e94a0f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -206,10 +206,10 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1dc7ef5a1d8..b92a4fa2a0a1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3068,9 +3068,9 @@ xfs_iflush_int( SYNCHRONIZE(); /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); + xfs_synchronize_times(ip); if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0b38b9a869ec..41555de1d1db 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_atime(xfs_inode_t *); +void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 47d5b663c37e..9794b876d6ff 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -231,6 +231,15 @@ xfs_inode_item_format( vecp++; nvecs = 1; + /* + * Make sure the linux inode is dirty. We do this before + * clearing i_update_core as the VFS will call back into + * XFS here and set i_update_core, so we need to dirty the + * inode first so that the ordering of i_update_core and + * unlogged modifications still works as described below. + */ + xfs_mark_inode_dirty_sync(ip); + /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging @@ -263,14 +272,9 @@ xfs_inode_item_format( } /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); - - /* - * make sure the linux inode is dirty - */ - xfs_mark_inode_dirty_sync(ip); + xfs_synchronize_times(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b68f9107e26c..62efab2f3839 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,6 +59,7 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + struct inode *inode; int error; error = xfs_iget(mp, NULL, ino, @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; + inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -83,16 +85,19 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + /* - * We are reading the atime from the Linux inode because the - * dinode might not be uptodate. + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. */ - buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; - buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a434f287962d..b572f7e840e0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2475,12 +2475,6 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - /* * If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode and the -- cgit v1.2.3 From c90b07e8dd9f263d2e2af1d9648ba269f4d0d8fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:27 +0000 Subject: xfs: fix xfs_quiesce_data We need to do a synchronous xfs_sync_fsdata to make sure the superblock actually is on disk when we return. Also remove SYNC_BDFLUSH flag to xfs_sync_inodes because that particular flag is never checked. Move xfs_filestream_flush call later to only release inodes after they have been written out. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 320be6aea492..7647b8db3ca3 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -419,14 +419,16 @@ xfs_quiesce_data( /* push non-blocking */ xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_filestream_flush(mp); - /* push and block */ + /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* drop inode references pinned by filestreams */ + xfs_filestream_flush(mp); + /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, 0); + error = xfs_sync_fsdata(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) -- cgit v1.2.3 From 69961a26b82931601e07c9e3656bd90c598f2395 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Oct 2009 20:29:28 +0000 Subject: xfs: cleanup ->sync_fs Sort out ->sync_fs to not perform a superblock writeback for the wait = 0 case as that is just an optional first pass and the superblock will be written back properly in the next call with wait = 1. Instead perform an opportunistic quota writeback to have less work later. Also remove the freeze special case as we do a proper wait = 1 call in the freeze code anyway. Also rename the function to xfs_fs_sync_fs to match the normal naming convention, update comments and avoid calling into the laptop_mode logic on an error. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1ea65b6e5ab6..eefea0d9d03d 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1148,7 +1148,7 @@ xfs_fs_put_super( } STATIC int -xfs_fs_sync_super( +xfs_fs_sync_fs( struct super_block *sb, int wait) { @@ -1156,23 +1156,23 @@ xfs_fs_sync_super( int error; /* - * Treat a sync operation like a freeze. This is to work - * around a race in sync_inodes() which works in two phases - * - an asynchronous flush, which can write out an inode - * without waiting for file size updates to complete, and a - * synchronous flush, which wont do anything because the - * async flush removed the inode's dirty flag. Also - * sync_inodes() will not see any files that just have - * outstanding transactions to be flushed because we don't - * dirty the Linux inode until after the transaction I/O - * completes. + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - error = xfs_quiesce_data(mp); - else - error = xfs_sync_fsdata(mp, 0); + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; - if (unlikely(laptop_mode)) { + if (laptop_mode) { int prev_sync_seq = mp->m_sync_seq; /* @@ -1191,7 +1191,7 @@ xfs_fs_sync_super( mp->m_sync_seq != prev_sync_seq); } - return -error; + return 0; } STATIC int @@ -1565,7 +1565,7 @@ static struct super_operations xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_super, + .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, -- cgit v1.2.3 From 932640e8adaff3d0c28ee32d4863c7a7a74df055 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Oct 2009 20:29:29 +0000 Subject: xfs: mark inodes dirty before issuing I/O To make sure they get properly waited on in sync when I/O is in flight and we latter need to update the inode size. Requires a new helper to check if an ioend structure is beyond the current EOF. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_aops.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 40d2226060d1..6a2910ee8a28 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -185,6 +185,24 @@ xfs_destroy_ioend( mempool_free(ioend, xfs_ioend_pool); } +/* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + /* * Update on-disk file size now that data has been written to disk. * The current in-memory file size is i_size. If a write is beyond @@ -192,13 +210,13 @@ xfs_destroy_ioend( * updated. If this write does not extend all the way to the valid * file size then restrict this update to the end of the write. */ + STATIC void xfs_setfilesize( xfs_ioend_t *ioend) { xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - xfs_fsize_t bsize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); @@ -206,14 +224,9 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return; - bsize = ioend->io_offset + ioend->io_size; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size =