diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-05 15:45:03 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-05 15:45:03 -0700 |
commit | b25c6644bfd3affd7d0127ce95c5c96c155a7515 (patch) | |
tree | 9ef9c0fe74a08b7baf3a3c3753368b0f481b581f /drivers/md/dm-zoned-metadata.c | |
parent | 818dbde78e0f4f11c9f804c36913a7ccfc2e87ad (diff) | |
parent | 64611a15ca9da91ff532982429c44686f4593b5f (diff) |
Merge tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- The largest change for this cycle is the DM zoned target's metadata
version 2 feature that adds support for pairing regular block devices
with a zoned device to ease the performance impact associated with
finite random zones of zoned device.
The changes came in three batches: the first prepared for and then
added the ability to pair a single regular block device, the second
was a batch of fixes to improve zoned's reclaim heuristic, and the
third removed the limitation of only adding a single additional
regular block device to allow many devices.
Testing has shown linear scaling as more devices are added.
- Add new emulated block size (ebs) target that emulates a smaller
logical_block_size than a block device supports
The primary use-case is to emulate "512e" devices that have 512 byte
logical_block_size and 4KB physical_block_size. This is useful to
some legacy applications that otherwise wouldn't be able to be used
on 4K devices because they depend on issuing IO in 512 byte
granularity.
- Add discard interfaces to DM bufio. First consumer of the interface
is the dm-ebs target that makes heavy use of dm-bufio.
- Fix DM crypt's block queue_limits stacking to not truncate
logic_block_size.
- Add Documentation for DM integrity's status line.
- Switch DMDEBUG from a compile time config option to instead use
dynamic debug via pr_debug.
- Fix DM multipath target's hueristic for how it manages
"queue_if_no_path" state internally.
DM multipath now avoids disabling "queue_if_no_path" unless it is
actually needed (e.g. in response to configure timeout or explicit
"fail_if_no_path" message).
This fixes reports of spurious -EIO being reported back to userspace
application during fault tolerance testing with an NVMe backend.
Added various dynamic DMDEBUG messages to assist with debugging
queue_if_no_path in the future.
- Add a new DM multipath "Historical Service Time" Path Selector.
- Fix DM multipath's dm_blk_ioctl() to switch paths on IO error.
- Improve DM writecache target performance by using explicit cache
flushing for target's single-threaded usecase and a small cleanup to
remove unnecessary test in persistent_memory_claim.
- Other small cleanups in DM core, dm-persistent-data, and DM
integrity.
* tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (62 commits)
dm crypt: avoid truncating the logical block size
dm mpath: add DM device name to Failing/Reinstating path log messages
dm mpath: enhance queue_if_no_path debugging
dm mpath: restrict queue_if_no_path state machine
dm mpath: simplify __must_push_back
dm zoned: check superblock location
dm zoned: prefer full zones for reclaim
dm zoned: select reclaim zone based on device index
dm zoned: allocate zone by device index
dm zoned: support arbitrary number of devices
dm zoned: move random and sequential zones into struct dmz_dev
dm zoned: per-device reclaim
dm zoned: add metadata pointer to struct dmz_dev
dm zoned: add device pointer to struct dm_zone
dm zoned: allocate temporary superblock for tertiary devices
dm zoned: convert to xarray
dm zoned: add a 'reserved' zone flag
dm zoned: improve logging messages for reclaim
dm zoned: avoid unnecessary device recalulation for secondary superblock
dm zoned: add debugging message for reading superblocks
...
Diffstat (limited to 'drivers/md/dm-zoned-metadata.c')
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 1046 |
1 files changed, 771 insertions, 275 deletions
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index bf2245370305..130b5a6d9f12 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -16,7 +16,7 @@ /* * Metadata version. */ -#define DMZ_META_VER 1 +#define DMZ_META_VER 2 /* * On-disk super block magic. @@ -69,8 +69,17 @@ struct dmz_super { /* Checksum */ __le32 crc; /* 48 */ + /* DM-Zoned label */ + u8 dmz_label[32]; /* 80 */ + + /* DM-Zoned UUID */ + u8 dmz_uuid[16]; /* 96 */ + + /* Device UUID */ + u8 dev_uuid[16]; /* 112 */ + /* Padding to full 512B sector */ - u8 reserved[464]; /* 512 */ + u8 reserved[400]; /* 512 */ }; /* @@ -122,8 +131,10 @@ enum { */ struct dmz_sb { sector_t block; + struct dmz_dev *dev; struct dmz_mblock *mblk; struct dmz_super *sb; + struct dm_zone *zone; }; /* @@ -131,28 +142,41 @@ struct dmz_sb { */ struct dmz_metadata { struct dmz_dev *dev; + unsigned int nr_devs; + + char devname[BDEVNAME_SIZE]; + char label[BDEVNAME_SIZE]; + uuid_t uuid; sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; unsigned int zone_bits_per_mblk; + sector_t zone_nr_blocks; + sector_t zone_nr_blocks_shift; + + sector_t zone_nr_sectors; + sector_t zone_nr_sectors_shift; + unsigned int nr_bitmap_blocks; unsigned int nr_map_blocks; + unsigned int nr_zones; unsigned int nr_useable_zones; unsigned int nr_meta_blocks; unsigned int nr_meta_zones; unsigned int nr_data_zones; + unsigned int nr_cache_zones; unsigned int nr_rnd_zones; unsigned int nr_reserved_seq; unsigned int nr_chunks; /* Zone information array */ - struct dm_zone *zones; + struct xarray zones; - struct dm_zone *sb_zone; struct dmz_sb sb[2]; unsigned int mblk_primary; + unsigned int sb_version; u64 sb_gen; unsigned int min_nr_mblks; unsigned int max_nr_mblks; @@ -168,15 +192,11 @@ struct dmz_metadata { /* Zone allocation management */ struct mutex map_lock; struct dmz_mblock **map_mblk; - unsigned int nr_rnd; - atomic_t unmap_nr_rnd; - struct list_head unmap_rnd_list; - struct list_head map_rnd_list; - unsigned int nr_seq; - atomic_t unmap_nr_seq; - struct list_head unmap_seq_list; - struct list_head map_seq_list; + unsigned int nr_cache; + atomic_t unmap_nr_cache; + struct list_head unmap_cache_list; + struct list_head map_cache_list; atomic_t nr_reserved_seq_zones; struct list_head reserved_seq_zones_list; @@ -184,22 +204,65 @@ struct dmz_metadata { wait_queue_head_t free_wq; }; +#define dmz_zmd_info(zmd, format, args...) \ + DMINFO("(%s): " format, (zmd)->label, ## args) + +#define dmz_zmd_err(zmd, format, args...) \ + DMERR("(%s): " format, (zmd)->label, ## args) + +#define dmz_zmd_warn(zmd, format, args...) \ + DMWARN("(%s): " format, (zmd)->label, ## args) + +#define dmz_zmd_debug(zmd, format, args...) \ + DMDEBUG("(%s): " format, (zmd)->label, ## args) /* * Various accessors */ -unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone) +static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) { - return ((unsigned int)(zone - zmd->zones)); + if (WARN_ON(!zone)) + return 0; + + return zone->id - zone->dev->zone_offset; } sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_sectors_shift; } sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_blocks_shift; +} + +unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks; +} + +unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_blocks_shift; +} + +unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors; +} + +unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd) +{ + return zmd->zone_nr_sectors_shift; +} + +unsigned int dmz_nr_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_zones; } unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) @@ -207,14 +270,88 @@ unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) return zmd->nr_chunks; } -unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx) { - return zmd->nr_rnd; + return zmd->dev[idx].nr_rnd; } -unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) +unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx) { - return atomic_read(&zmd->unmap_nr_rnd); + return atomic_read(&zmd->dev[idx].unmap_nr_rnd); +} + +unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) +{ + return zmd->nr_cache; +} + +unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) +{ + return atomic_read(&zmd->unmap_nr_cache); +} + +unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx) +{ + return zmd->dev[idx].nr_seq; +} + +unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx) +{ + return atomic_read(&zmd->dev[idx].unmap_nr_seq); +} + +static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) +{ + return xa_load(&zmd->zones, zone_id); +} + +static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, + unsigned int zone_id, struct dmz_dev *dev) +{ + struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); + + if (!zone) + return ERR_PTR(-ENOMEM); + + if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) { + kfree(zone); + return ERR_PTR(-EBUSY); + } + + INIT_LIST_HEAD(&zone->link); + atomic_set(&zone->refcount, 0); + zone->id = zone_id; + zone->chunk = DMZ_MAP_UNMAPPED; + zone->dev = dev; + + return zone; +} + +const char *dmz_metadata_label(struct dmz_metadata *zmd) +{ + return (const char *)zmd->label; +} + +bool dmz_check_dev(struct dmz_metadata *zmd) +{ + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (!dmz_check_bdev(&zmd->dev[i])) + return false; + } + return true; +} + +bool dmz_dev_is_dying(struct dmz_metadata *zmd) +{ + unsigned int i; + + for (i = 0; i < zmd->nr_devs; i++) { + if (dmz_bdev_is_dying(&zmd->dev[i])) + return true; + } + return false; } /* @@ -402,9 +539,10 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, { struct dmz_mblock *mblk, *m; sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; struct bio *bio; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return ERR_PTR(-EIO); /* Get a new block and a BIO to read it */ @@ -440,7 +578,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, /* Submit read BIO */ bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); @@ -537,6 +675,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, sector_t mblk_no) { struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; /* Check rbtree */ spin_lock(&zmd->mblk_lock); @@ -555,7 +694,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ERR_PTR(-EIO); } @@ -579,10 +718,11 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, unsigned int set) { + struct dmz_dev *dev = zmd->sb[set].dev; sector_t block = zmd->sb[set].block + mblk->no; struct bio *bio; - if (dmz_bdev_is_dying(zmd->dev)) + if (dmz_bdev_is_dying(dev)) return -EIO; bio = bio_alloc(GFP_NOIO, 1); @@ -594,7 +734,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); @@ -607,13 +747,16 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, /* * Read/write a metadata block. */ -static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, - struct page *page) +static int dmz_rdwr_block(struct dmz_dev *dev, int op, + sector_t block, struct page *page) { struct bio *bio; int ret; - if (dmz_bdev_is_dying(zmd->dev)) + if (WARN_ON(!dev)) + return -EIO; + + if (dmz_bdev_is_dying(dev)) return -EIO; bio = bio_alloc(GFP_NOIO, 1); @@ -621,14 +764,14 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, return -ENOMEM; bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_set_dev(bio, zmd->dev->bdev); + bio_set_dev(bio, dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); if (ret) - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); return ret; } @@ -637,18 +780,32 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, */ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) { - sector_t block = zmd->sb[set].block; struct dmz_mblock *mblk = zmd->sb[set].mblk; struct dmz_super *sb = zmd->sb[set].sb; + struct dmz_dev *dev = zmd->sb[set].dev; + sector_t sb_block; u64 sb_gen = zmd->sb_gen + 1; int ret; sb->magic = cpu_to_le32(DMZ_MAGIC); - sb->version = cpu_to_le32(DMZ_META_VER); + + sb->version = cpu_to_le32(zmd->sb_version); + if (zmd->sb_version > 1) { + BUILD_BUG_ON(UUID_SIZE != 16); + export_uuid(sb->dmz_uuid, &zmd->uuid); + memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE); + export_uuid(sb->dev_uuid, &dev->uuid); + } sb->gen = cpu_to_le64(sb_gen); - sb->sb_block = cpu_to_le64(block); + /* + * The metadata always references the absolute block address, + * ie relative to the entire block range, not the per-device + * block address. + */ + sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift; + sb->sb_block = cpu_to_le64(sb_block); sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); @@ -659,9 +816,10 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) sb->crc = 0; sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); - ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); + ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, + mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); return ret; } @@ -674,6 +832,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, unsigned int set) { struct dmz_mblock *mblk; + struct dmz_dev *dev = zmd->sb[set].dev; struct blk_plug plug; int ret = 0, nr_mblks_submitted = 0; @@ -695,7 +854,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); - dmz_check_bdev(zmd->dev); + dmz_check_bdev(dev); ret = -EIO; } nr_mblks_submitted--; @@ -703,7 +862,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); return ret; } @@ -740,6 +899,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) { struct dmz_mblock *mblk; struct list_head write_list; + struct dmz_dev *dev; int ret; if (WARN_ON(!zmd)) @@ -753,6 +913,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) * from modifying metadata. */ down_write(&zmd->mblk_sem); + dev = zmd->sb[zmd->mblk_primary].dev; /* * This is called from the target flush work and reclaim work. @@ -760,7 +921,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ dmz_lock_flush(zmd); - if (dmz_bdev_is_dying(zmd->dev)) { + if (dmz_bdev_is_dying(dev)) { ret = -EIO; goto out; } @@ -772,7 +933,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); + ret = blkdev_issue_flush(dev->bdev, GFP_NOIO); goto err; } @@ -821,7 +982,7 @@ err: list_splice(&write_list, &zmd->mblk_dirty_list); spin_unlock(&zmd->mblk_lock); } - if (!dmz_check_bdev(zmd->dev)) + if (!dmz_check_bdev(dev)) ret = -EIO; goto out; } @@ -829,12 +990,31 @@ err: /* * Check super block. */ -static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) +static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, + bool tertiary) { + struct dmz_super *sb = dsb->sb; + struct dmz_dev *dev = dsb->dev; unsigned int nr_meta_zones, nr_data_zones; - struct dmz_dev *dev = zmd->dev; u32 crc, stored_crc; - u64 gen; + u64 gen, sb_block; + + if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { + dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", + DMZ_MAGIC, le32_to_cpu(sb->magic)); + return -ENXIO; + } + + zmd->sb_version = le32_to_cpu(sb->version); + if (zmd->sb_version > DMZ_META_VER) { + dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", + DMZ_META_VER, zmd->sb_version); + return -EINVAL; + } + if (zmd->sb_version < 2 && tertiary) { + dmz_dev_err(dev, "Tertiary superblocks are not supported"); + return -EINVAL; + } gen = le64_to_cpu(sb->gen); stored_crc = le32_to_cpu(sb->crc); @@ -846,20 +1026,57 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) return -ENXIO; } - if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { - dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", - DMZ_MAGIC, le32_to_cpu(sb->magic)); - return -ENXIO; - } + sb_block = le64_to_cpu(sb->sb_block); + if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift ) { + dmz_dev_err(dev, "Invalid superblock position " + "(is %llu expected %llu)", + sb_block, + (u64)dsb->zone->id << zmd->zone_nr_blocks_shift); + return -EINVAL; + } + if (zmd->sb_version > 1) { + uuid_t sb_uuid; + + import_uuid(&sb_uuid, sb->dmz_uuid); + if (uuid_is_null(&sb_uuid)) { + dmz_dev_err(dev, "NULL DM-Zoned uuid"); + return -ENXIO; + } else if (uuid_is_null(&zmd->uuid)) { + uuid_copy(&zmd->uuid, &sb_uuid); + } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) { + dmz_dev_err(dev, "mismatching DM-Zoned uuid, " + "is %pUl expected %pUl", + &sb_uuid, &zmd->uuid); + return -ENXIO; + } + if (!strlen(zmd->label)) + memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE); + else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) { + dmz_dev_err(dev, "mismatching DM-Zoned label, " + "is %s expected %s", + sb->dmz_label, zmd->label); + return -ENXIO; + } + import_uuid(&dev->uuid, sb->dev_uuid); + if (uuid_is_null(&dev->uuid)) { + dmz_dev_err(dev, "NULL device uuid"); + return -ENXIO; + } - if (le32_to_cpu(sb->version) != DMZ_META_VER) { - dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", - DMZ_META_VER, le32_to_cpu(sb->version)); - return -ENXIO; + if (tertiary) { + /* + * Generation number should be 0, but it doesn't + * really matter if it isn't. + */ + if (gen != 0) + dmz_dev_warn(dev, "Invalid generation %llu", + gen); + return 0; + } } - nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1) - >> dev->zone_nr_blocks_shift; + nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) + >> zmd->zone_nr_blocks_shift; if (!nr_meta_zones || nr_meta_zones >= zmd->nr_rnd_zones) { dmz_dev_err(dev, "Invalid number of metadata blocks"); @@ -895,10 +1112,13 @@ static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) /* * Read the first or second super block from disk. */ -static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) { - return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block, - zmd->sb[set].mblk->page); + dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", + set, sb->dev->name, sb->block); + + return dmz_rdwr_block(sb->dev, REQ_OP_READ, + sb->block, sb->mblk->page); } /* @@ -908,8 +1128,9 @@ static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) */ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) { - unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; + unsigned int zone_nr_blocks = zmd->zone_nr_blocks; struct dmz_mblock *mblk; + unsigned int zone_id = zmd->sb[0].zone->id; int i; /* Allocate a block */ @@ -922,24 +1143,29 @@ static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) /* Bad first super block: search for the second one */ zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; - for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { - if (dmz_read_sb(zmd, 1) != 0) + zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); + zmd->sb[1].dev = zmd->sb[0].dev; + for (i = 1; i < zmd->nr_rnd_zones; i++) { + if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0) break; if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) return 0; zmd->sb[1].block += zone_nr_blocks; + zmd->sb[1].zone = dmz_get(zmd, zone_id + i); } dmz_free_mblock(zmd, mblk); zmd->sb[1].mblk = NULL; + zmd->sb[1].zone = NULL; + zmd->sb[1].dev = NULL; return -EIO; } /* - * Read the first or second super block from disk. + * Read a super block from disk. */ -static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) +static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) { struct dmz_mblock *mblk; int ret; @@ -949,14 +1175,14 @@ static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) if (!mblk) return -ENOMEM; - zmd->sb[set].mblk = mblk; - zmd->sb[set].sb = mblk->data; + sb->mblk = mblk; + sb->sb = mblk->data; /* Read super block */ - ret = dmz_read_sb(zmd, set); + ret = dmz_read_sb(zmd, sb, set); if (ret) { dmz_free_mblock(zmd, mblk); - zmd->sb[set].mblk = NULL; + sb->mblk = NULL; return ret; } @@ -972,14 +1198,13 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) struct page *page; int i, ret; - dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); + dmz_dev_warn(zmd->sb[dst_set].dev, + "Metadata set %u invalid: recovering", dst_set); if (dst_set == 0) - zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); - else { - zmd->sb[1].block = zmd->sb[0].block + - (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); - } + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + else + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); page = alloc_page(GFP_NOIO); if (!page) @@ -987,11 +1212,11 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) /* Copy metadata blocks */ for (i = 1; i < zmd->nr_meta_blocks; i++) { - ret = dmz_rdwr_block(zmd, REQ_OP_READ, + ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ, zmd->sb[src_set].block + i, page); if (ret) goto out; - ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, + ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE, zmd->sb[dst_set].block + i, page); if (ret) goto out; @@ -1023,53 +1248,73 @@ static int dmz_load_sb(struct dmz_metadata *zmd) u64 sb_gen[2] = {0, 0}; int ret; + if (!zmd->sb[0].zone) { + dmz_zmd_err(zmd, "Primary super block zone not set"); + return -ENXIO; + } + /* Read and check the primary super block */ - zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); - ret = dmz_get_sb(zmd, 0); + zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); + zmd->sb[0].dev = zmd->sb[0].zone->dev; + ret = dmz_get_sb(zmd, &zmd->sb[0], 0); if (ret) { - dmz_dev_err(zmd->dev, "Read primary super block failed"); + dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); return ret; } - ret = dmz_check_sb(zmd, zmd->sb[0].sb); + ret = dmz_check_sb(zmd, &zmd->sb[0], false); /* Read and check secondary super block */ if (ret == 0) { sb_good[0] = true; - zmd->sb[1].block = zmd->sb[0].block + - (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); - ret = dmz_get_sb(zmd, 1); + if (!zmd->sb[1].zone) { + unsigned int zone_id = + zmd->sb[0].zone->id + zmd->nr_meta_zones; + + zmd->sb[1].zone = dmz_get(zmd, zone_id); + } + zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); + zmd->sb[1].dev = zmd->sb[0].dev; + ret = dmz_get_sb(zmd, &zmd->sb[1], 1); } else ret = dmz_lookup_secondary_sb(zmd); if (ret) { - dmz_dev_err(zmd->dev, "Read secondary super block failed"); + dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed"); return ret; } - ret = dmz_check_sb(zmd, zmd->sb[1].sb); + ret = dmz_check_sb(zmd, &zmd->sb[1], false); if (ret == 0) sb_good[1] = true; /* Use highest generation sb first */ if (!sb_good[0] && !sb_good[1]) { - dmz_dev_err(zmd->dev, "No valid super block found"); + dmz_zmd_err(zmd, "No valid super block found"); return -EIO; } if (sb_good[0]) sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); - else + else { ret = dmz_recover_mblocks(zmd, 0); + if (ret) { + dmz_dev_err(zmd->sb[0].dev, + "Recovery of superblock 0 failed"); + return -EIO; + } + } if (sb_good[1]) sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); - else + else { ret = dmz_recover_mblocks(zmd, 1); - if (ret) { - dmz_dev_err(zmd->dev, "Recovery failed"); - return -EIO; + if (ret) { + dmz_dev_err(zmd->sb[1].dev, + "Recovery of superblock 1 failed"); + return -EIO; + } } if (sb_gen[0] >= sb_gen[1]) { @@ -1080,32 +1325,70 @@ static int dmz_load_sb(struct dmz_metadata *zmd) zmd->mblk_primary = 1; } - dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)", + dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev, + "Using super block %u (gen %llu)", zmd->mblk_primary, zmd->sb_gen); - return 0; + if (zmd->sb_version > 1) { + int i; + struct dmz_sb *sb; + + sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + for (i = 1; i < zmd->nr_devs; i++) { + sb->block = 0; + sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset); + sb->dev = &zmd->dev[i]; + if (!dmz_is_meta(sb->zone)) { + dmz_dev_err(sb->dev, + "Tertiary super block zone %u not marked as metadata zone", + sb->zone->id); + ret = -EINVAL; + goto out_kfree; + } + ret = dmz_get_sb(zmd, sb, i + 1); + if (ret) { + dmz_dev_err(sb->dev, + "Read tertiary super block failed"); + dmz_free_mblock(zmd, sb->mblk); + goto out_kfree; + } + ret = dmz_check_sb(zmd, sb, true); + dmz_free_mblock(zmd, sb->mblk); + if (ret == -EINVAL) + goto out_kfree; + } + out_kfree: + kfree(sb); + } + return ret; } /* * Initialize a zone descriptor. */ -static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) +static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) { - struct dmz_metadata *zmd = data; - struct dm_zone *zone = &zmd->zones[idx]; - struct dmz_dev *dev = zmd->dev; + struct dmz_dev *dev = data; + struct dmz_metadata *zmd = dev->metadata; + int idx = num + dev->zone_offset; + struct dm_zone *zone; + + zone = dmz_insert(zmd, idx, dev); + if (IS_ERR(zone)) + return PTR_ERR(zone); - /* Ignore the eventual last runt (smaller) zone */ - if (blkz->len != dev->zone_nr_sectors) { - if (blkz->start + blkz->len == dev->capacity) + if (blkz->len != zmd->zone_nr_sectors) { + if (zmd->sb_version > 1) { + /* Ignore the eventual runt (smaller) zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + return 0; + } else if (blkz->start + blkz->len == dev->capacity) return 0; return -ENXIO; } - INIT_LIST_HEAD(&zone->link); - atomic_set(&zone->refcount, 0); - zone->chunk = DMZ_MAP_UNMAPPED; - switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); @@ -1131,13 +1414,45 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; - if (!zmd->sb_zone) { - /* Super block zone */ - zmd->sb_zone = zone; + if (zmd->nr_devs == 1 && !zmd->sb[0].zone) { + /* Primary super block zone */ + zmd->sb[0].zone = zone; } } + if (zmd->nr_devs > 1 && num == 0) { + /* + * Tertiary superblock zones are always at the + * start of the zoned devices, so mark them + * as metadata zone. + */ + set_bit(DMZ_META, &zone->flags); + } } + return 0; +} + +static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) +{ + int idx; + sector_t zone_offset = 0; + for(idx = 0; idx < dev->nr_zones; idx++) { + struct dm_zone *zone; + + zone = dmz_insert(zmd, idx, dev); + if (IS_ERR(zone)) + return PTR_ERR(zone); + set_bit(DMZ_CACHE, &zone->flags); + zone->wp_block = 0; + zmd->nr_cache_zones++; + zmd->nr_useable_zones++; + if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { + /* Disable runt zone */ + set_bit(DMZ_OFFLINE, &zone->flags); + break; + } + zone_offset += zmd->zone_nr_sectors; + } return 0; } @@ -1146,8 +1461,15 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) */ static void dmz_drop_zones(struct dmz_metadata *zmd) { - kfree(zmd->zones); - zmd->zones = NULL; + int idx; + + for(idx = 0; idx < zmd->nr_zones; idx++) { + struct dm_zone *zone = xa_load(&zmd->zones, idx); + + kfree(zone); + xa_erase(&zmd->zones, idx); + } + xa_destroy(&zmd->zones); } /* @@ -1156,32 +1478,87 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) */ static int dmz_init_zones(struct dmz_metadata *zmd) { - struct dmz_dev *dev = zmd->dev; - int ret; + int i, ret; + struct dmz_dev *zoned_dev = &zmd->dev[0]; /* Init */ - zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; + zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors; + zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); + zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); + zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); + zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3; zmd->zone_nr_bitmap_blocks = max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); - zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks, + zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks, DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ - zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); - if (!zmd->zones) - return -ENOMEM; + zmd->nr_zones = 0; + for (i = 0; i < zmd->nr_devs; i++) { + struct dmz_dev *dev = &zmd->dev[i]; + + dev->metadata = zmd; + zmd->nr_zones += dev->nr_zones; + + atomic_set(&dev->unmap_nr_rnd, 0); + INIT_LIST_HEAD(&dev->unmap_rnd_list); + INIT_LIST_HEAD(&dev->map_rnd_list); + + atomic_set(&dev->unmap_nr_seq, 0); + INIT_LIST_HEAD(&dev->unmap_seq_list); + INIT_LIST_HEAD(&dev->map_seq_list); + } + + if (!zmd->nr_zones) { + DMERR("(%s): No zones found", zmd->devname); + return -ENXIO; + } + xa_init(&zmd->zones); + + DMDEBUG("(%s): Using %zu B for zone information", + zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); - dmz_dev_info(dev, "Using %zu B for zone information", - sizeof(struct dm_zone) * dev->nr_zones); + if (zmd->nr_devs > 1) { + ret = dmz_emulate_zones(zmd, &zmd->dev[0]); + if (ret < 0) { + DMDEBUG("(%s): Failed to emulate zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + + /* + * Primary superblock zone is always at zone 0 when multiple + * drives are present. + */ + zmd->sb[0].zone = dmz_get(zmd, 0); + + for (i = 1; i < zmd->nr_devs; i++) { + zoned_dev = &zmd->dev[i]; + + ret = blkdev_report_zones(zoned_dev->bdev, 0, + BLK_ALL_ZONES, + dmz_init_zone, zoned_dev); + if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); + dmz_drop_zones(zmd); + return ret; + } + } + return 0; + } /* * Get zone information and initialize zone descriptors. At the same * time, determine where the super block should be: first block of the * first randomly writable zone. */ - ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone, - zmd); + ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, + dmz_init_zone, zoned_dev); if (ret < 0) { + DMDEBUG("(%s): Failed to report zones, error %d", + zmd->devname, ret); dmz_drop_zones(zmd); return ret; } @@ -1213,9 +1590,13 @@ static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, |