summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--block/bfq-cgroup.c3
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/bio.c3
-rw-r--r--block/blk-zoned.c149
-rw-r--r--block/blk.h4
-rw-r--r--block/ioctl.c2
-rw-r--r--drivers/block/brd.c5
-rw-r--r--drivers/block/null_blk_main.c40
-rw-r--r--drivers/block/xen-blkback/blkback.c2
-rw-r--r--drivers/md/dm-table.c12
-rw-r--r--drivers/md/dm-zoned-target.c2
-rw-r--r--drivers/scsi/sd_zbc.c2
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/io-wq.c2
-rw-r--r--fs/io-wq.h11
-rw-r--r--fs/io_uring.c694
-rw-r--r--include/linux/blkdev.h24
-rw-r--r--include/linux/bvec.h22
-rw-r--r--include/linux/socket.h20
-rw-r--r--include/uapi/linux/io_uring.h1
-rw-r--r--net/socket.c76
21 files changed, 672 insertions, 406 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index cea0ae12f937..e1419edde2ec 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
{
struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
+ if (!bfqg)
+ return;
+
blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index fb95dbb21dd8..bf62c25cde8f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
-static void bio_integrity_free(struct bio *bio)
+void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
diff --git a/block/bio.c b/block/bio.c
index b1170ec18464..9d54aa37ce6c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -233,6 +233,9 @@ fallback:
void bio_uninit(struct bio *bio)
{
bio_disassociate_blkg(bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_free(bio);
}
EXPORT_SYMBOL(bio_uninit);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 6fad6f3f6980..d00fcfd71dfe 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -70,30 +70,20 @@ void __blk_req_zone_write_unlock(struct request *rq)
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
- sector_t nr_sectors)
-{
- sector_t zone_sectors = blk_queue_zone_sectors(q);
-
- return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
-}
-
/**
* blkdev_nr_zones - Get number of zones
- * @bdev: Target block device
+ * @disk: Target gendisk
*
- * Description:
- * Return the total number of zones of a zoned block device.
- * For a regular block device, the number of zones is always 0.
+ * Return the total number of zones of a zoned block device. For a block
+ * device without zone capabilities, the number of zones is always 0.
*/
-unsigned int blkdev_nr_zones(struct block_device *bdev)
+unsigned int blkdev_nr_zones(struct gendisk *disk)
{
- struct request_queue *q = bdev_get_queue(bdev);
+ sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
- if (!blk_queue_is_zoned(q))
+ if (!blk_queue_is_zoned(disk->queue))
return 0;
-
- return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk));
+ return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
@@ -342,16 +332,18 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
- kfree(q->seq_zones_bitmap);
- q->seq_zones_bitmap = NULL;
+ kfree(q->conv_zones_bitmap);
+ q->conv_zones_bitmap = NULL;
kfree(q->seq_zones_wlock);
q->seq_zones_wlock = NULL;
}
struct blk_revalidate_zone_args {
struct gendisk *disk;
- unsigned long *seq_zones_bitmap;
+ unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock;
+ unsigned int nr_zones;
+ sector_t zone_sectors;
sector_t sector;
};
@@ -364,25 +356,33 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk;
struct request_queue *q = disk->queue;
- sector_t zone_sectors = blk_queue_zone_sectors(q);
sector_t capacity = get_capacity(disk);
/*
* All zones must have the same size, with the exception on an eventual
* smaller last zone.
*/
- if (zone->start + zone_sectors < capacity &&
- zone->len != zone_sectors) {
- pr_warn("%s: Invalid zoned device with non constant zone size\n",
- disk->disk_name);
- return false;
- }
+ if (zone->start == 0) {
+ if (zone->len == 0 || !is_power_of_2(zone->len)) {
+ pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
+ disk->disk_name, zone->len);
+ return -ENODEV;
+ }
- if (zone->start + zone->len >= capacity &&
- zone->len > zone_sectors) {
- pr_warn("%s: Invalid zoned device with larger last zone size\n",
- disk->disk_name);
- return -ENODEV;
+ args->zone_sectors = zone->len;
+ args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
+ } else if (zone->start + args->zone_sectors < capacity) {
+ if (zone->len != args->zone_sectors) {
+ pr_warn("%s: Invalid zoned device with non constant zone size\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+ } else {
+ if (zone->len > args->zone_sectors) {
+ pr_warn("%s: Invalid zoned device with larger last zone size\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
}
/* Check for holes in the zone report */
@@ -395,8 +395,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
/* Check zone type */
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!args->conv_zones_bitmap) {
+ args->conv_zones_bitmap =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+ set_bit(idx, args->conv_zones_bitmap);
+ break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ if (!args->seq_zones_wlock) {
+ args->seq_zones_wlock =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->seq_zones_wlock)
+ return -ENOMEM;
+ }
break;
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
@@ -404,78 +418,54 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV;
}
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
- set_bit(idx, args->seq_zones_bitmap);
-
args->sector += zone->len;
return 0;
}
-static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones,
- struct blk_revalidate_zone_args *args)
-{
- /*
- * Ensure that all memory allocations in this context are done as
- * if GFP_NOIO was specified.
- */
- unsigned int noio_flag = memalloc_noio_save();
- struct request_queue *q = disk->queue;
- int ret;
-
- args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
- if (!args->seq_zones_wlock)
- return -ENOMEM;
- args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
- if (!args->seq_zones_bitmap)
- return -ENOMEM;
-
- ret = disk->fops->report_zones(disk, 0, nr_zones,
- blk_revalidate_zone_cb, args);
- memalloc_noio_restore(noio_flag);
- return ret;
-}
-
/**
* blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
* @disk: Target disk
*
* Helper function for low-level device drivers to (re) allocate and initialize
* a disk request queue zone bitmaps. This functions should normally be called
- * within the disk ->revalidate method. For BIO based queues, no zone bitmap
- * is allocated.
+ * within the disk ->revalidate method for blk-mq based drivers. For BIO based
+ * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
+ * is correct.
*/
int blk_revalidate_disk_zones(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
- struct blk_revalidate_zone_args args = { .disk = disk };
- int ret = 0;
+ struct blk_revalidate_zone_args args = {
+ .disk = disk,
+ };
+ unsigned int noio_flag;
+ int ret;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO;
+ if (WARN_ON_ONCE(!queue_is_mq(q)))
+ return -EIO;
/*
- * BIO based queues do not use a scheduler so only q->nr_zones
- * needs to be updated so that the sysfs exposed value is correct.
+ * Ensure that all memory allocations in this context are done as if
+ * GFP_NOIO was specified.
*/
- if (!queue_is_mq(q)) {
- q->nr_zones = nr_zones;
- return 0;
- }
-
- if (nr_zones)
- ret = blk_update_zone_info(disk, nr_zones, &args);
+ noio_flag = memalloc_noio_save();
+ ret = disk->fops->report_zones(disk, 0, UINT_MAX,
+ blk_revalidate_zone_cb, &args);
+ memalloc_noio_restore(noio_flag);
/*
- * Install the new bitmaps, making sure the queue is stopped and
- * all I/Os are completed (i.e. a scheduler is not referencing the
- * bitmaps).
+ * Install the new bitmaps and update nr_zones only once the queue is
+ * stopped and all I/Os are completed (i.e. a scheduler is not
+ * referencing the bitmaps).
*/
blk_mq_freeze_queue(q);
if (ret >= 0) {
- q->nr_zones = nr_zones;
+ blk_queue_chunk_sectors(q, args.zone_sectors);
+ q->nr_zones = args.nr_zones;
swap(q->seq_zones_wlock, args.seq_zones_wlock);
- swap(q->seq_zones_bitmap, args.seq_zones_bitmap);
+ swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
ret = 0;
} else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
@@ -484,8 +474,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
blk_mq_unfreeze_queue(q);
kfree(args.seq_zones_wlock);
- kfree(args.seq_zones_bitmap);
+ kfree(args.conv_zones_bitmap);
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
diff --git a/block/blk.h b/block/blk.h
index 2bea40180b6f..6842f28c033e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -121,6 +121,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
+void bio_integrity_free(struct bio *bio);
static inline bool bio_integrity_endio(struct bio *bio)
{
if (bio_integrity(bio))
@@ -166,6 +167,9 @@ static inline bool bio_integrity_endio(struct bio *bio)
{
return true;
}
+static inline void bio_integrity_free(struct bio *bio)
+{
+}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
unsigned long blk_rq_timeout(unsigned long timeout);
diff --git a/block/ioctl.c b/block/ioctl.c
index 7ac8a66c9787..5de98b97af2a 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -512,7 +512,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKGETZONESZ:
return put_uint(arg, bdev_zone_sectors(bdev));
case BLKGETNRZONES:
- return put_uint(arg, blkdev_nr_zones(bdev));
+ return put_uint(arg, blkdev_nr_zones(bdev->bd_disk));
case HDIO_GETGEO:
return blkdev_getgeo(bdev, argp);
case BLKRAGET:
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c548a5a6c1a0..a8730cc4db10 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -297,6 +297,10 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
unsigned int len = bvec.bv_len;
int err;
+ /* Don't support un-aligned buffer */
+ WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
+ (len & (SECTOR_SIZE - 1)));
+
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
bio_op(bio), sector);
if (err)
@@ -382,7 +386,6 @@ static struct brd_device *brd_alloc(int i)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_max_hw_sectors(brd->brd_queue, 1024);
/* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 795fda576824..ae8d4bc532b0 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -1559,14 +1559,13 @@ static int init_driver_queues(struct nullb *nullb)
static int null_gendisk_register(struct nullb *nullb)
{
+ sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
struct gendisk *disk;
- sector_t size;
disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
if (!disk)
return -ENOMEM;
- size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
- set_capacity(disk, size >> 9);
+ set_capacity(disk, size);
disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
@@ -1576,12 +1575,19 @@ static int null_gendisk_register(struct nullb *nullb)
disk->queue = nullb->q;
strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+#ifdef CONFIG_BLK_DEV_ZONED
if (nullb->dev->zoned) {
- int ret = blk_revalidate_disk_zones(disk);
-
- if (ret != 0)
- return ret;
+ if (queue_is_mq(nullb->q)) {
+ int ret = blk_revalidate_disk_zones(disk);
+ if (ret)
+ return ret;
+ } else {
+ blk_queue_chunk_sectors(nullb->q,
+ nullb->dev->zone_size_sects);
+ nullb->q->nr_zones = blkdev_nr_zones(disk);
+ }
}
+#endif
add_disk(disk);
return 0;
@@ -1607,7 +1613,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
return blk_mq_alloc_tag_set(set);
}
-static void null_validate_conf(struct nullb_device *dev)
+static int null_validate_conf(struct nullb_device *dev)
{
dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
@@ -1634,6 +1640,14 @@ static void null_validate_conf(struct nullb_device *dev)
/* can not stop a queue */
if (dev->queue_mode == NULL_Q_BIO)
dev->mbps = 0;
+
+ if (dev->zoned &&
+ (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
+ pr_err("zone_size must be power-of-two\n");
+ return -EINVAL;
+ }
+
+ return 0;
}
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
@@ -1666,7 +1680,9 @@ static int null_add_dev(struct nullb_device *dev)
struct nullb *nullb;
int rv;
- null_validate_conf(dev);
+ rv = null_validate_conf(dev);
+ if (rv)
+ return rv;
nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
if (!nullb) {
@@ -1731,7 +1747,6 @@ static int null_add_dev(struct nullb_device *dev)
if (rv)
goto out_cleanup_blk_queue;
- blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
nullb->q->limits.zoned = BLK_ZONED_HM;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
blk_queue_required_elevator_features(nullb->q,
@@ -1792,11 +1807,6 @@ static int __init null_init(void)
g_bs = PAGE_SIZE;
}
- if (!is_power_of_2(g_zone_size)) {
- pr_err("zone_size must be power-of-two\n");
- return -EINVAL;
- }
-
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
pr_err("invalid home_node value\n");
g_home_node = NUMA_NO_NODE;
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index fd1e19f1a49f..3666afa639d1 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -936,6 +936,8 @@ next:
out_of_memory:
pr_alert("%s: out of memory\n", __func__);
put_free_pages(ring, pages_to_gnt, segs_to_map);
+ for (i = last_map; i < num; i++)
+ pages[i]->handle = BLKBACK_INVALID_HANDLE;
return -ENOMEM;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2ae0c1913766..0a2cc197f62b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1954,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
/*
* For a zoned target, the number of zones should be updated for the
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based
- * target, this is all that is needed. For a request based target, the
- * queue zone bitmaps must also be updated.
- * Use blk_revalidate_disk_zones() to handle this.
+ * target, this is all that is needed.
*/
- if (blk_queue_is_zoned(q))
- blk_revalidate_disk_zones(t->md->disk);
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(q)) {
+ WARN_ON_ONCE(queue_is_mq(q));
+ q->nr_zones = blkdev_nr_zones(t->md->disk);
+ }
+#endif
/* Allow reads to exceed readahead limits */
q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 4574e0dedbd6..70a1063161c0 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -727,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
- dev->nr_zones = blkdev_nr_zones(dev->bdev);
+ dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk);
dmz->dev = dev;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 0e5ede48f045..27d72c1d4654 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -412,8 +412,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
goto err;
/* The drive satisfies the kernel restrictions: set it up */
- blk_queue_chunk_sectors(sdkp->disk->queue,
- logical_to_sectors(sdkp->device, zone_blocks));
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
blk_queue_required_elevator_features(sdkp->disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ee63c2732fa2..69bf2fb6f7cd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1531,7 +1531,7 @@ rescan:
ret = blk_add_partitions(disk, bdev);
if (ret == -EAGAIN)
goto rescan;
- } else {
+ } else if (invalidate) {
/*
* Tell userspace that the media / partition table may have
* changed.
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 91b85df0861e..74b40506c5d9 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -111,7 +111,7 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
- struct cred *creds;
+ const struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 600e0158cba7..7c333a28e2a7 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
list->last = prev;
if (prev)
prev->next = node->next;
+ node->next = NULL;
}
#define wq_list_for_each(pos, prv, head) \
@@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct mm_struct *mm;
struct user_struct *user;
- struct cred *creds;
+ const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
@@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk)
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
-#endif
+#endif /* CONFIG_IO_WQ */
-static inline bool io_wq_current_is_worker(void)
-{
- return in_task() && (current->flags & PF_IO_WORKER);
-}
-#endif
+#endif /* INTERNAL_IO_WQ_H */
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ec53aa7cdc94..405be10da73d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -145,7 +145,7 @@ struct io_rings {
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
- * there are not more requests pending thatn there is space in
+ * there are not more requests pending than there is space in
* the completion queue.
*
* Written by the kernel, shouldn't be modified by the
@@ -238,7 +238,7 @@ struct io_ring_ctx {
struct user_struct *user;
- struct cred *creds;
+ const struct cred *creds;
/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
struct completion *completions;
@@ -275,7 +275,8 @@ struct io_ring_ctx {
* manipulate the list, hence no extra locking is needed there.
*/
struct list_head poll_list;
- struct rb_root cancel_tree;
+ struct hlist_head *cancel_hash;
+ unsigned cancel_hash_bits;
spinlock_t inflight_lock;
struct list_head inflight_list;
@@ -303,9 +304,32 @@ struct io_timeout_data {
u32 seq_offset;
};
-struct io_timeout {
- struct file *file;
- struct io_timeout_data *data;
+struct io_async_connect {
+ struct sockaddr_storage address;
+};
+
+struct io_async_msghdr {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov;
+ struct sockaddr __user *uaddr;
+ struct msghdr msg;
+};
+
+struct io_async_rw {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov;
+ ssize_t nr_segs;
+ ssize_t size;
+};
+
+struct io_async_ctx {
+ struct io_uring_sqe sqe;
+ union {
+ struct io_async_rw rw;
+ struct io_async_msghdr msg;
+ struct io_async_connect connect;
+ struct io_timeout_data timeout;
+ };
};
/*
@@ -319,10 +343,10 @@ struct io_kiocb {
struct file *file;
struct kiocb rw;
struct io_poll_iocb poll;
- struct io_timeout timeout;
};
const struct io_uring_sqe *sqe;
+ struct io_async_ctx *io;
struct file *ring_file;
int ring_fd;
bool has_user;
@@ -332,7 +356,7 @@ struct io_kiocb {
struct io_ring_ctx *ctx;
union {
struct list_head list;
- struct rb_node rb_node;
+ struct hlist_node hash_node;
};
struct list_head link_list;
unsigned int flags;
@@ -353,7 +377,6 @@ struct io_kiocb {
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
-#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
@@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx->completions)
goto err;
+ /*
+ * Use 5 bits less than the max cq entries, that should give us around
+ * 32 entries per hash list if totally full and uniformly spread.
+ */
+ hash_bits = ilog2(p->cq_entries);
+ hash_bits -= 5;
+ if (hash_bits <= 0)
+ hash_bits = 1;
+ ctx->cancel_hash_bits = hash_bits;
+ ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!ctx->cancel_hash)
+ goto err;
+ __hash_init(ctx->cancel_hash, 1U << hash_bits);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->poll_list);
- ctx->cancel_tree = RB_ROOT;
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
@@ -459,6 +497,7 @@ err:
if (ctx->fallback_req)
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx->completions);
+ kfree(ctx->cancel_hash);
kfree(ctx);
return NULL;
}
@@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+ ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
@@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
}
got_it:
+ req->io = NULL;
req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
@@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (req->flags & REQ_F_FREE_SQE)
- kfree(req->sqe);
+ if (req->io)
+ kfree(req->io);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
@@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
- if (req->flags & REQ_F_TIMEOUT)
- kfree(req->timeout.data);
percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
@@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+ ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
@@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt;
bool wake_ev = false;
/* Already got next link */
@@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
* potentially happen if the chain is messed up, check to be on the
* safe side.
*/
- nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
- while (nxt) {
- list_del_init(&nxt->list);
+ while (!list_empty(&req->link_list)) {
+ struct io_kiocb *nxt = list_first_entry(&req->link_list,
+ struct io_kiocb, link_list);
- if ((req->flags & REQ_F_LINK_TIMEOUT) &&
- (nxt->flags & REQ_F_TIMEOUT)) {
+ if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
+ (nxt->flags & REQ_F_TIMEOUT))) {
+ list_del_init(&nxt->link_list);
wake_ev |= io_link_cancel_timeout(nxt);
- nxt = list_first_entry_or_null(&req->link_list,
- struct io_kiocb, list);
req->flags &= ~REQ_F_LINK_TIMEOUT;
continue;
}
- if (!list_empty(&req->link_list)) {
- INIT_LIST_HEAD(&nxt->link_list);
- list_splice(&req->link_list, &nxt->link_list);
- nxt->flags |= REQ_F_LINK;
- }
+ list_del_init(&req->link_list);
+ if (!list_empty(&nxt->link_list))
+ nxt->flags |= REQ_F_LINK;
*nxtptr = nxt;
break;
}
@@ -923,15 +957,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
static void io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
- link = list_first_entry(&req->link_list, struct io_kiocb, list);
- list_del_init(&link->list);
+ struct io_kiocb *link = list_first_entry(&req->link_list,
+ struct io_kiocb, link_list);
+ list_del_init(&link->link_list);
trace_io_uring_fail_link(req, link);
if ((req->flags & REQ_F_LINK_TIMEOUT) &&
@@ -1079,9 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if (((req->flags &
- (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
- REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
+ if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
+ !req->io) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free);
@@ -1410,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
if (S_ISREG(file_inode(req->file)->i_mode))
req->flags |= REQ_F_ISREG;
- /*
- * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
- * we know to async punt it even if it was opened O_NONBLOCK
- */
- if (force_nonblock && !io_file_supports_async(req->file)) {
- req->flags |= REQ_F_MUST_PUNT;
- return -EAGAIN;
- }
-
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -1587,6 +1612,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return io_import_fixed(req->ctx, rw, sqe, iter);
}
+ if (req->io) {
+ struct io_async_rw *iorw = &req->io->rw;
+
+ *iovec = iorw->iov;
+ iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
+ if (iorw->iov == iorw->fast_iov)
+ *iovec = NULL;
+ return iorw->size;
+ }
+
if (!req->has_user)
return -EFAULT;
@@ -1657,6 +1692,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
+static void io_req_map_io(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter)
+{
+ req->io->rw.nr_segs = iter->nr_segs;
+ req->io->rw.size = io_size;
+ req->io->rw.iov = iovec;
+ if (!req->io->rw.iov) {
+ req->io->rw.iov = req->io->rw.fast_iov;
+ memcpy(req->io->rw.iov, fast_iov,
+ sizeof(struct iovec) * iter->nr_segs);
+ }
+}
+
+static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter)
+{
+ req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+ if (req->io) {
+ io_req_map_io(req, io_size, iovec, fast_iov, iter);
+ memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe));
+ req->sqe = &req->io->sqe;
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static int io_read_prep(struct io_kiocb *req, struct iovec **iovec,
+ struct iov_iter *iter, bool force_nonblock)
+{
+ ssize_t ret;
+
+ ret = io_prep_rw(req, force_nonblock);
+ if (ret)
+ return ret;
+
+ if (unlikely(!(req->file->f_mode & FMODE_READ)))
+ return -EBADF;
+
+ return io_import_iovec(READ, req, iovec, iter);
+}
+
static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
@@ -1665,23 +1744,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t read_size, ret;
-
- ret = io_prep_rw(req, force_nonblock);
- if (ret)
- return ret;
- file = kiocb->ki_filp;
+ ssize_t io_size, ret;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- return -EBADF;
-
- ret = io_import_iovec(READ, req, &iovec, &iter);