From 9114d79569a3fb858a7ecb1f21cb1dec93dc2f21 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:42 +0100 Subject: drbd: cleanup bogus assert message This fixes ASSERT( mdev->state.disk == D_FAILED ) in drivers/block/drbd/drbd_main.c When we detach from local disk, we let the local refcount hit zero twice. First, we transition to D_FAILED, so we won't give out new references to incoming requests; we still may give out *internal* references, though. Once the refcount hits zero [1] while in D_FAILED, we queue a transition to D_DISKLESS to our worker. We need to queue it, because we may be in atomic context when putting the reference. Once the transition to D_DISKLESS actually happened [2] from worker context, we don't give out new internal references either. Between hitting zero the first time [1] and actually transition to D_DISKLESS [2], there may be a few very short lived internal get/put, so we may hit zero more than once while being in D_FAILED, or even see a race where a an internal get_ldev() happened while D_FAILED, but the corresponding put_ldev() happens just after the transition to D_DISKLESS. That's why we have the additional test_and_set_bit(GO_DISKLESS,); and that's why the assert was placed wrong. Since there was exactly one code path left to drbd_go_diskless(), and that checks already for D_FAILED, drop that assert, and fold in the drbd_queue_work(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 7 ++++--- drivers/block/drbd/drbd_main.c | 7 ------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6b51afa1aae1..db504d021a6e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1148,7 +1148,6 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, char *why, enum bm_flag flags); extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); -extern void drbd_go_diskless(struct drbd_conf *mdev); extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout @@ -2053,9 +2052,11 @@ static inline void put_ldev(struct drbd_conf *mdev) if (mdev->state.disk == D_DISKLESS) /* even internal references gone, safe to destroy */ drbd_ldev_destroy(mdev); - if (mdev->state.disk == D_FAILED) + if (mdev->state.disk == D_FAILED) { /* all application IO references gone. */ - drbd_go_diskless(mdev); + if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) + drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); + } wake_up(&mdev->misc_wait); } } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e98da675f0c1..731a28eedc56 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3252,13 +3252,6 @@ static int w_go_diskless(struct drbd_work *w, int unused) return 0; } -void drbd_go_diskless(struct drbd_conf *mdev) -{ - D_ASSERT(mdev->state.disk == D_FAILED); - if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); -} - /** * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap * @mdev: DRBD device. -- cgit v1.2.3 From ae8bf312e97d554b6aa32e7b2ceb993812ad0835 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:43 +0100 Subject: drbd: cleanup ondisk meta data layout calculations and defines Add a comment about our meta data layout variants, and rename a few defines (e.g. MD_RESERVED_SECT -> MD_128MB_SECT) to make it clear that they are short hand for fixed constants, and not arbitrarily to be redefined as one may see fit. Properly pad struct meta_data_on_disk to 4kB, and initialize to zero not only the first 512 Byte, but all of it in drbd_md_sync(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 28 ++++++++++--- drivers/block/drbd/drbd_bitmap.c | 13 +++++- drivers/block/drbd/drbd_int.h | 86 +++++++++++++++++++++++----------------- drivers/block/drbd/drbd_main.c | 11 +++-- drivers/block/drbd/drbd_nl.c | 42 +++++++++++++++----- 5 files changed, 123 insertions(+), 57 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 92510f8ad013..b230d91ec430 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -209,7 +209,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); + /* we do all our meta data IO in aligned 4k blocks. */ + err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); if (err) { dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); @@ -350,6 +351,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); } +static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) +{ + const unsigned int stripes = 1; + const unsigned int stripe_size_4kB = MD_32kB_SECT/MD_4kB_SECT; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = mdev->al_tr_number % (stripe_size_4kB * stripes); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; +} + static int _al_write_transaction(struct drbd_conf *mdev) { @@ -432,13 +451,12 @@ _al_write_transaction(struct drbd_conf *mdev) if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle = 0; - sector = mdev->ldev->md.md_offset - + mdev->ldev->md.al_offset - + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); + sector = al_tr_number_to_on_disk_sector(mdev); crc = crc32c(0, buffer, 4096); buffer->crc32c = cpu_to_be32(crc); + /* normal execution path goes through all three branches */ if (drbd_bm_write_hinted(mdev)) err = -EIO; /* drbd_chk_io_error done already */ @@ -446,8 +464,6 @@ _al_write_transaction(struct drbd_conf *mdev) err = -EIO; drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); } else { - /* advance ringbuffer position and transaction counter */ - mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); mdev->al_tr_number++; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 8dc29502dc08..64fbb8385cdc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) } } +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) +{ + u64 bitmap_sectors; + if (ldev->md.al_offset == 8) + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; + else + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; + return bitmap_sectors << (9 + 3); +} + /* * make sure the bitmap has enough room for the attached storage, * if necessary, resize. @@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) words = ALIGN(bits, 64) >> LN2_BPL; if (get_ldev(mdev)) { - u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; + u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev); put_ldev(mdev); if (bits > bits_on_disk) { dev_info(DEV, "bits = %lu\n", bits); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index db504d021a6e..60c89e5b298c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -753,13 +753,8 @@ struct drbd_md { u32 flags; u32 md_size_sect; - s32 al_offset; /* signed relative sector offset to al area */ + s32 al_offset; /* signed relative sector offset to activity log */ s32 bm_offset; /* signed relative sector offset to bitmap */ - - /* u32 al_nr_extents; important for restoring the AL - * is stored into ldev->dc.al_extents, which in turn - * gets applied to act_log->nr_elements - */ }; struct drbd_backing_dev { @@ -1009,7 +1004,6 @@ struct drbd_conf { struct lru_cache *act_log; /* activity log */ unsigned int al_tr_number; int al_tr_cycle; - int al_tr_pos; /* position of the next transaction in the journal */ wait_queue_head_t seq_wait; atomic_t packet_seq; unsigned int peer_seq; @@ -1151,21 +1145,41 @@ extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout - We reserve a 128MB Block (4k aligned) - * either at the end of the backing device - * or on a separate meta data device. */ + * + * We currently have two possible layouts. + * Offsets in (512 byte) sectors. + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * Variants: + * old, indexed fixed size meta data: + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * [padding*] are zero or up to 7 unused 512 Byte sectors to the + * end of the device, so that the [4k superblock] will be 4k aligned. + * + * The activity log consists of 4k transaction blocks, + * which are written in a ring-buffer, or striped ring-buffer like fashion, + * which are writtensize used to be fixed 32kB, + * but is about to become configurable. + */ -/* The following numbers are sectors */ -/* Allows up to about 3.8TB, so if you want more, +/* Our old fixed size meta data layout + * allows up to about 3.8TB, so if you want more, * you need to use the "flexible" meta data format. */ -#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ -#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ -#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ -#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) - -/* we do all meta data IO in 4k blocks */ -#define MD_BLOCK_SHIFT 12 -#define MD_BLOCK_SIZE (1<md.md_offset + MD_AL_OFFSET - 1; + return bdev->md.md_offset + MD_4kB_SECT -1; case DRBD_MD_INDEX_FLEX_EXT: default: - return bdev->md.md_offset + bdev->md.md_size_sect; + return bdev->md.md_offset + bdev->md.md_size_sect -1; } } @@ -1861,13 +1876,11 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, rcu_read_unlock(); switch (meta_dev_idx) { - default: /* external, some index */ - return MD_RESERVED_SECT * meta_dev_idx; + default: /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * meta_dev_idx; case DRBD_MD_INDEX_INTERNAL: /* with drbd08, internal meta data is always "flexible" */ case DRBD_MD_INDEX_FLEX_INT: - /* sizeof(struct md_on_disk_07) == 4k - * position: last 4k aligned block of 4k size */ if (!bdev->backing_bdev) { if (__ratelimit(&drbd_ratelimit_state)) { dev_err(DEV, "bdev->backing_bdev==NULL\n"); @@ -1875,8 +1888,9 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, } return 0; } - return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - - MD_AL_OFFSET; + /* sizeof(struct md_on_disk_07) == 4k + * position: last 4k aligned block of 4k size */ + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; case DRBD_MD_INDEX_FLEX_EXT: return 0; } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 731a28eedc56..76faeab40c8f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2834,6 +2834,7 @@ void conn_md_sync(struct drbd_tconn *tconn) rcu_read_unlock(); } +/* aligned 4kByte */ struct meta_data_on_disk { u64 la_size; /* last agreed size. */ u64 uuid[UI_SIZE]; /* UUIDs. */ @@ -2843,13 +2844,13 @@ struct meta_data_on_disk { u32 magic; u32 md_size_sect; u32 al_offset; /* offset to this block */ - u32 al_nr_extents; /* important for restoring the AL */ + u32 al_nr_extents; /* important for restoring the AL (userspace) */ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 la_peer_max_bio_size; /* last peer max_bio_size */ - u32 reserved_u32[3]; + u8 reserved_u8[4096 - (7*8 + 8*4)]; } __packed; /** @@ -2862,6 +2863,10 @@ void drbd_md_sync(struct drbd_conf *mdev) sector_t sector; int i; + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + del_timer(&mdev->md_sync_timer); /* timer may be rearmed by drbd_md_mark_dirty() now. */ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) @@ -2876,7 +2881,7 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!buffer) goto out; - memset(buffer, 0, 512); + memset(buffer, 0, sizeof(*buffer)); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); for (i = UI_CURRENT; i < UI_SIZE; i++) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 2af26fc95280..581f6800cc30 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -696,12 +696,32 @@ out: return 0; } -/* initializes the md.*_offset members, so we are able to find - * the on disk meta data */ +/* Initializes the md.*_offset members, so we are able to find + * the on disk meta data. + * + * We currently have two possible layouts: + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * Activity log size used to be fixed 32kB, + * but is about to become configurable. + */ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; + unsigned int al_size_sect = MD_32kB_SECT; int meta_dev_idx; rcu_read_lock(); @@ -710,23 +730,23 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, switch (meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ - bdev->md.md_size_sect = MD_RESERVED_SECT; + bdev->md.md_size_sect = MD_128MB_SECT; bdev->md.md_offset = drbd_md_ss__(mdev, bdev); - bdev->md.al_offset = MD_AL_OFFSET; - bdev->md.bm_offset = MD_BM_OFFSET; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_FLEX_EXT: /* just occupy the full device; unit: sectors */ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); bdev->md.md_offset = 0; - bdev->md.al_offset = MD_AL_OFFSET; - bdev->md.bm_offset = MD_BM_OFFSET; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still fixed */ - bdev->md.al_offset = -MD_AL_SECTORS; + bdev->md.al_offset = -al_size_sect; /* we need (slightly less than) ~ this much bitmap sectors: */ md_size_sect = drbd_get_capacity(bdev->backing_bdev); md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); @@ -735,11 +755,11 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, /* plus the "drbd meta data super block", * and the activity log; */ - md_size_sect += MD_BM_OFFSET; + md_size_sect += MD_4kB_SECT + al_size_sect; bdev->md.md_size_sect = md_size_sect; /* bitmap offset is adjusted by 'super' block size */ - bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; break; } rcu_read_unlock(); @@ -1416,7 +1436,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) min_md_device_sectors = (2<<10); } else { max_possible_sectors = DRBD_MAX_SECTORS; - min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { -- cgit v1.2.3 From 3a4d4eb3cb03fbc66696fc8cd472701d56f3aee7 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:44 +0100 Subject: drbd: prepare for new striped layout of activity log Introduce two new on-disk meta data fields: al_stripes and al_stripe_size_4k The intended use case is activity log on RAID 0 or similar. Logically consecutive transactions will advance their on-disk position by al_stripe_size_4k 4kB (transaction sized) blocks. Right now, these are still asserted to be the backward compatible values al_stripes = 1, al_stripe_size_4k = 8 (which amounts to 32kB). Also introduce a caching member for meta_dev_idx in the in-core structure: even though it is initially passed in in the rcu-protected disk_conf structure, it cannot change without a detach/attach cycle. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 6 ++-- drivers/block/drbd/drbd_int.h | 46 +++++++++++------------- drivers/block/drbd/drbd_main.c | 77 +++++++++++++++++++++++++++++++++++----- drivers/block/drbd/drbd_nl.c | 5 ++- 4 files changed, 94 insertions(+), 40 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b230d91ec430..7e7680e8da6c 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -353,11 +353,11 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) { - const unsigned int stripes = 1; - const unsigned int stripe_size_4kB = MD_32kB_SECT/MD_4kB_SECT; + const unsigned int stripes = mdev->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k; /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = mdev->al_tr_number % (stripe_size_4kB * stripes); + unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k); /* ... to aligned 4k on disk block */ t = ((t % stripes) * stripe_size_4kB) + t/stripes; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 60c89e5b298c..ee19ba28b59a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -755,6 +755,14 @@ struct drbd_md { s32 al_offset; /* signed relative sector offset to activity log */ s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ + s32 meta_dev_idx; + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + u32 al_size_4k; /* cached product of the above */ }; struct drbd_backing_dev { @@ -1862,38 +1870,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) } /** - * drbd_md_ss__() - Return the sector number of our meta data super block - * @mdev: DRBD device. + * drbd_md_ss() - Return the sector number of our meta data super block * @bdev: Meta data block device. */ -static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) { - int meta_dev_idx; + const int meta_dev_idx = bdev->md.meta_dev_idx; - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); + if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) + return 0; - switch (meta_dev_idx) { - default: /* external, some index; this is the old fixed size layout */ - return MD_128MB_SECT * meta_dev_idx; - case DRBD_MD_INDEX_INTERNAL: - /* with drbd08, internal meta data is always "flexible" */ - case DRBD_MD_INDEX_FLEX_INT: - if (!bdev->backing_bdev) { - if (__ratelimit(&drbd_ratelimit_state)) { - dev_err(DEV, "bdev->backing_bdev==NULL\n"); - dump_stack(); - } - return 0; - } - /* sizeof(struct md_on_disk_07) == 4k - * position: last 4k aligned block of 4k size */ + /* Since drbd08, internal meta data is always "flexible". + * position: last 4k aligned block of 4k size */ + if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; - case DRBD_MD_INDEX_FLEX_EXT: - return 0; - } + + /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * bdev->md.meta_dev_idx; } static inline void diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 76faeab40c8f..7a2e07b45ecf 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2850,7 +2850,11 @@ struct meta_data_on_disk { u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 la_peer_max_bio_size; /* last peer max_bio_size */ - u8 reserved_u8[4096 - (7*8 + 8*4)]; + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + + u8 reserved_u8[4096 - (7*8 + 10*4)]; } __packed; /** @@ -2898,7 +2902,10 @@ void drbd_md_sync(struct drbd_conf *mdev) buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); - D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); + buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes); + buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k); + + D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { @@ -2916,13 +2923,60 @@ out: put_ldev(mdev); } +static int check_activity_log_stripe_size(struct drbd_conf *mdev, + struct meta_data_on_disk *on_disk, + struct drbd_md *in_core) +{ + u32 al_stripes = be32_to_cpu(on_disk->al_stripes); + u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); + u64 al_size_4k; + + /* both not set: default to old fixed size activity log */ + if (al_stripes == 0 && al_stripe_size_4k == 0) { + al_stripes = 1; + al_stripe_size_4k = MD_32kB_SECT/8; + } + + /* some paranoia plausibility checks */ + + /* we need both values to be set */ + if (al_stripes == 0 || al_stripe_size_4k == 0) + goto err; + + al_size_4k = (u64)al_stripes * al_stripe_size_4k; + + /* Upper limit of activity log area, to avoid potential overflow + * problems in al_tr_number_to_on_disk_sector(). As right now, more + * than 72 * 4k blocks total only increases the amount of history, + * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ + if (al_size_4k > (16 * 1024 * 1024/4)) + goto err; + + /* Lower limit: we need at least 8 transaction slots (32kB) + * to not break existing setups */ + if (al_size_4k < MD_32kB_SECT/8) + goto err; + + in_core->al_stripe_size_4k = al_stripe_size_4k; + in_core->al_stripes = al_stripes; + in_core->al_size_4k = al_size_4k; + + return 0; +err: + dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", + al_stripes, al_stripe_size_4k); + return -EINVAL; +} + /** * drbd_md_read() - Reads in the meta data super block * @mdev: DRBD device. * @bdev: Device from which the meta data should be read in. * - * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case + * Return NO_ERROR on success, and an enum drbd_ret_code in case * something goes wrong. + * + * Called exactly once during drbd_adm_attach() */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { @@ -2937,6 +2991,10 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!buffer) goto out; + /* First, figure out where our meta data superblock is located. */ + bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is called BEFORE disk is attached */ @@ -2954,40 +3012,43 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = ERR_MD_UNCLEAN; goto err; } + + rv = ERR_MD_INVALID; if (magic != DRBD_MD_MAGIC_08) { if (magic == DRBD_MD_MAGIC_07) dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); else dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); - rv = ERR_MD_INVALID; goto err; } + + if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) + goto err; + if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", be32_to_cpu(buffer->al_offset), bdev->md.al_offset); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { dev_err(DEV, "unexpected md_size: %u (expected %u)\n", be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); - rv = ERR_MD_INVALID; goto err; } + rv = NO_ERROR; + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 581f6800cc30..104b7cea691e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -727,24 +727,23 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, rcu_read_lock(); meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + switch (meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ bdev->md.md_size_sect = MD_128MB_SECT; - bdev->md.md_offset = drbd_md_ss__(mdev, bdev); bdev->md.al_offset = MD_4kB_SECT; bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_FLEX_EXT: /* just occupy the full device; unit: sectors */ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); - bdev->md.md_offset = 0; bdev->md.al_offset = MD_4kB_SECT; bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: - bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still fixed */ bdev->md.al_offset = -al_size_sect; /* we need (slightly less than) ~ this much bitmap sectors: */ -- cgit v1.2.3 From 68e41a43f18b681f83329c8ad83123571bb8db0d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:45 +0100 Subject: drbd: use the cached meta_dev_idx Now we have the cached meta_dev_idx member, we can get rid of a few rcu_read_lock() sections and rcu_dereference(). Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 32 +++++--------------------------- drivers/block/drbd/drbd_nl.c | 7 +------ 2 files changed, 6 insertions(+), 33 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ee19ba28b59a..6eecdec9da2b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1777,9 +1777,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, * BTW, for internal meta data, this happens to be the maximum capacity * we could agree upon with our peer node. */ -static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) { - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + bdev->md.bm_offset; @@ -1789,30 +1789,13 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi } } -static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) -{ - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - return _drbd_md_first_sector(meta_dev_idx, bdev); -} - /** * drbd_md_last_sector() - Return the last sector number of the meta data area * @bdev: Meta data block device. */ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) { - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + MD_4kB_SECT -1; @@ -1840,18 +1823,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) { sector_t s; - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: s = drbd_get_capacity(bdev->backing_bdev) ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, - _drbd_md_first_sector(meta_dev_idx, bdev)) + drbd_md_first_sector(bdev)) : 0; break; case DRBD_MD_INDEX_FLEX_EXT: diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 104b7cea691e..5621df86967a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -722,14 +722,10 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, { sector_t md_size_sect = 0; unsigned int al_size_sect = MD_32kB_SECT; - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ bdev->md.md_size_sect = MD_128MB_SECT; @@ -761,7 +757,6 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; break; } - rcu_read_unlock(); } /* input size is expected to be in KB */ -- cgit v1.2.3 From cccac9857d624dab74b23bafe0482fcdd91df7d8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:46 +0100 Subject: drbd: mechanically rename la_size to la_size_sect Make it obvious that this value is in units of 512 Byte sectors. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 6 +++--- drivers/block/drbd/drbd_nl.c | 16 ++++++++-------- drivers/block/drbd/drbd_receiver.c | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7a2e07b45ecf..6b956fc04dc8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2836,7 +2836,7 @@ void conn_md_sync(struct drbd_tconn *tconn) /* aligned 4kByte */ struct meta_data_on_disk { - u64 la_size; /* last agreed size. */ + u64 la_size_sect; /* last agreed size. */ u64 uuid[UI_SIZE]; /* UUIDs. */ u64 device_uuid; u64 reserved_u64_1; @@ -2887,7 +2887,7 @@ void drbd_md_sync(struct drbd_conf *mdev) memset(buffer, 0, sizeof(*buffer)); - buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); for (i = UI_CURRENT; i < UI_SIZE; i++) buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); buffer->flags = cpu_to_be32(mdev->ldev->md.flags); @@ -3049,7 +3049,7 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = NO_ERROR; - bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); for (i = UI_CURRENT; i < UI_SIZE; i++) bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); bdev->md.flags = be32_to_cpu(buffer->flags); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 5621df86967a..d5211b06df45 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -819,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev) enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size, u_size; + sector_t la_size_sect, u_size; sector_t size; char ppb[10]; @@ -842,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds prev_first_sect = drbd_md_first_sector(mdev->ldev); prev_size = mdev->ldev->md.md_size_sect; - la_size = mdev->ldev->md.la_size_sect; + la_size_sect = mdev->ldev->md.la_size_sect; /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); @@ -878,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds if (rv == dev_size_error) goto out; - la_size_changed = (la_size != mdev->ldev->md.la_size_sect); + la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) || prev_size != mdev->ldev->md.md_size_sect; @@ -900,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds drbd_md_mark_dirty(mdev); } - if (size > la_size) + if (size > la_size_sect) rv = grew; - if (size < la_size) + if (size < la_size_sect) rv = shrunk; out: lc_unlock(mdev->act_log); @@ -917,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t u_size, int assume_peer_has_space) { sector_t p_size = mdev->p_size; /* partner's disk size. */ - sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ + sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ sector_t m_size; /* my size */ sector_t size = 0; @@ -931,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, if (p_size && m_size) { size = min_t(sector_t, p_size, m_size); } else { - if (la_size) { - size = la_size; + if (la_size_sect) { + size = la_size_sect; if (m_size && m_size < size) size = m_size; if (p_size && p_size < size) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a9eccfc6079b..8172a2cfdead 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3992,7 +3992,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) clear_bit(DISCARD_MY_DATA, &mdev->flags); - drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ + drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */ return 0; } -- cgit v1.2.3 From c04ccaa669e147ffb66e4e74d82c7dbfc100ec5e Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:47 +0100 Subject: drbd: read meta data early, base on-disk offsets on super block We used to calculate all on-disk meta data offsets, and then compare the stored offsets, basically treating them as magic numbers. Now with the activity log striping, the activity log size is no longer fixed. We need to first read the super block, then base the activity log and bitmap offsets on the stored offsets/al stripe settings. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 11 +++- drivers/block/drbd/drbd_main.c | 131 ++++++++++++++++++++++++++++++++------- drivers/block/drbd/drbd_nl.c | 15 ++--- drivers/block/drbd/drbd_worker.c | 3 +- 4 files changed, 123 insertions(+), 37 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 7e7680e8da6c..c79625aa8cf2 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -168,7 +168,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL) + /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ + ; + else if (!get_ldev_if_state(mdev, D_ATTACHING)) { + /* Corresponding put_ldev in drbd_md_io_complete() */ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); err = -ENODEV; goto out; @@ -199,9 +203,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, BUG_ON(!bdev->md_bdev); - dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", + dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", + (void*)_RET_IP_ ); if (sector < drbd_md_first_sector(bdev) || sector + 7 > drbd_md_last_sector(bdev)) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6b956fc04dc8..e55271d6e7f6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2968,6 +2968,86 @@ err: return -EINVAL; } +static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + sector_t capacity = drbd_get_capacity(bdev->md_bdev); + struct drbd_md *in_core = &bdev->md; + s32 on_disk_al_sect; + s32 on_disk_bm_sect; + + /* The on-disk size of the activity log, calculated from offsets, and + * the size of the activity log calculated from the stripe settings, + * should match. + * Though we could relax this a bit: it is ok, if the striped activity log + * fits in the available on-disk activity log size. + * Right now, that would break how resize is implemented. + * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware + * of possible unused padding space in the on disk layout. */ + if (in_core->al_offset < 0) { + if (in_core->bm_offset > in_core->al_offset) + goto err; + on_disk_al_sect = -in_core->al_offset; + on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; + } else { + if (in_core->al_offset != MD_4kB_SECT) + goto err; + if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) + goto err; + + on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; + on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; + } + + /* old fixed size meta data is exactly that: fixed. */ + if (in_core->meta_dev_idx >= 0) { + if (in_core->md_size_sect != MD_128MB_SECT + || in_core->al_offset != MD_4kB_SECT + || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT + || in_core->al_stripes != 1 + || in_core->al_stripe_size_4k != MD_32kB_SECT/8) + goto err; + } + + if (capacity < in_core->md_size_sect) + goto err; + if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) + goto err; + + /* should be aligned, and at least 32k */ + if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) + goto err; + + /* should fit (for now: exactly) into the available on-disk space; + * overflow prevention is in check_activity_log_stripe_size() above. */ + if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) + goto err; + + /* again, should be aligned */ + if (in_core->bm_offset & 7) + goto err; + + /* FIXME check for device grow with flex external meta data? */ + + /* can the available bitmap space cover the last agreed device size? */ + if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) + goto err; + + return 0; + +err: + dev_err(DEV, "meta data offsets don't make sense: idx=%d " + "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " + "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", + in_core->meta_dev_idx, + in_core->al_stripes, in_core->al_stripe_size_4k, + in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, + (unsigned long long)in_core->la_size_sect, + (unsigned long long)capacity); + + return -EINVAL; +} + + /** * drbd_md_read() - Reads in the meta data super block * @mdev: DRBD device. @@ -2976,7 +3056,8 @@ err: * Return NO_ERROR on success, and an enum drbd_ret_code in case * something goes wrong. * - * Called exactly once during drbd_adm_attach() + * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, + * even before @bdev is assigned to @mdev->ldev. */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { @@ -2984,14 +3065,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) u32 magic, flags; int i, rv = NO_ERROR; - if (!get_ldev_if_state(mdev, D_ATTACHING)) - return ERR_IO_MD_DISK; + if (mdev->state.disk != D_DISKLESS) + return ERR_DISK_CONFIGURED; buffer = drbd_md_get_buffer(mdev); if (!buffer) - goto out; + return ERR_NOMEM; - /* First, figure out where our meta data superblock is located. */ + /* First, figure out where our meta data superblock is located, + * and read it. */ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); @@ -3022,14 +3104,29 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) goto err; } - if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); goto err; + } - if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { - dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", - be32_to_cpu(buffer->al_offset), bdev->md.al_offset); + + /* convert to in_core endian */ + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); + bdev->md.al_offset = be32_to_cpu(buffer->al_offset); + bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); + + if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) goto err; - } + if (check_offsets_and_sizes(mdev, bdev)) + goto err; + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); @@ -3041,20 +3138,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) goto err; } - if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { - dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", - be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); - goto err; - } - rv = NO_ERROR; - bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); - for (i = UI_CURRENT; i < UI_SIZE; i++) - bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); - bdev->md.flags = be32_to_cpu(buffer->flags); - bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); - spin_lock_irq(&mdev->tconn->req_lock); if (mdev->state.conn < C_CONNECTED) { unsigned int peer; @@ -3066,8 +3151,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) err: drbd_md_put_buffer(mdev); - out: - put_ldev(mdev); return rv; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d5211b06df45..974ea47a656a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -721,7 +721,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; - unsigned int al_size_sect = MD_32kB_SECT; + unsigned int al_size_sect = bdev->md.al_size_4k * 8; bdev->md.md_offset = drbd_md_ss(bdev); @@ -1413,8 +1413,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ - drbd_md_set_sector_offsets(mdev, nbc); + /* Read our meta data super block early. + * This also sets other on-disk offsets. */ + retcode = drbd_md_read(mdev, nbc); + if (retcode != NO_ERROR) + goto fail; if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", @@ -1481,8 +1484,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; - drbd_md_set_sector_offsets(mdev, nbc); - if (!mdev->bitmap) { if (drbd_bm_init(mdev)) { retcode = ERR_NOMEM; @@ -1490,10 +1491,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } } - retcode = drbd_md_read(mdev, nbc); - if (retcode != NO_ERROR) - goto force_diskless_dec; - if (mdev->state.conn < C_CONNECTED && mdev->state.role == R_PRIMARY && (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 424dc7bdf9b7..34b5d5d23ac4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->done = 1; wake_up(&mdev->misc_wait); bio_put(bio); - put_ldev(mdev); + if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ + put_ldev(mdev); } /* reads on behalf of the partner, -- cgit v1.2.3 From 56392d2f40aac4b520fc50bc356f40e07f7e1c7d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:48 +0100 Subject: drbd: Clarify when activity log I/O is delegated to the worker thread Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 49 +++++++++++++++++++++----------------- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index c79625aa8cf2..82199d9a9a61 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -104,7 +104,7 @@ struct update_al_work { int err; }; -static int al_write_transaction(struct drbd_conf *mdev); +static int al_write_transaction(struct drbd_conf *mdev, bool delegate); void *drbd_md_get_buffer(struct drbd_conf *mdev) { @@ -246,7 +246,10 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) return al_ext; } -void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) { /* for bios crossing activity log extent boundaries, * we may need to activate two extents in one go */ @@ -255,6 +258,17 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) unsigned enr; bool locked = false; + /* When called through generic_make_request(), we must delegate + * activity log I/O to the worker thread: a further request + * submitted via generic_make_request() within the same task + * would be queued on current->bio_list, and would only start + * after this function returns (see generic_make_request()). + * + * However, if we *are* the worker, we must not delegate to ourselves. + */ + + if (delegate) + BUG_ON(current == mdev->tconn->worker.task); D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); @@ -270,13 +284,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) (locked = lc_try_lock_for_transaction(mdev->act_log))); if (locked) { - /* drbd_al_write_transaction(mdev,al_ext,enr); - * recurses into generic_make_request(), which - * disallows recursion, bios being serialized on the - * current->bio_tail list now. - * we have to delegate updates to the activity log - * to the worker thread. */ - /* Double check: it may have been committed by someone else, * while we have been waiting for the lock. */ if (mdev->act_log->pending_changes) { @@ -287,7 +294,7 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) rcu_read_unlock(); if (write_al_updates) { - al_write_transaction(mdev); + al_write_transaction(mdev, delegate); mdev->al_writ_cnt++; } @@ -495,20 +502,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused) /* Calls from worker context (see w_restart_disk_io()) need to write the transaction directly. Others came through generic_make_request(), those need to delegate it to the worker. */ -static int al_write_transaction(struct drbd_conf *mdev) +static int al_write_transaction(struct drbd_conf *mdev, bool delegate) { - struct update_al_work al_work; - - if (current == mdev->tconn->worker.task) + if (delegate) { + struct update_al_work al_work; + init_completion(&al_work.event); + al_work.w.cb = w_al_write_transaction; + al_work.w.mdev = mdev; + drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); + wait_for_completion(&al_work.event); + return al_work.err; + } else return _al_write_transaction(mdev); - - init_completion(&al_work.event); - al_work.w.cb = w_al_write_transaction; - al_work.w.mdev = mdev; - drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); - wait_for_completion(&al_work.event); - - return al_work.err; } static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6eecdec9da2b..453fccfc440c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1598,7 +1598,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ -extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 8172a2cfdead..1921871ca9a8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2265,7 +2265,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); peer_req->flags |= EE_CALL_AL_COMPLETE_IO; peer_req->flags &= ~EE_MAY_SET_IN_SYNC; - drbd_al_begin_io(mdev, &peer_req->i); + drbd_al_begin_io(mdev, &peer_req->i, true); } err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2b8303ad63c9..7d1ff1aaeb71 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1054,7 +1054,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &mdev->flags)) { req->rq_state |= RQ_IN_ACT_LOG; - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, &req->i, true); } spin_lock_irq(&mdev->tconn->req_lock); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 34b5d5d23ac4..f41e224caa7c 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1411,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) struct drbd_conf *mdev = w->mdev; if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, &req->i, false); drbd_req_make_private_bio(req, req->master_bio); req->private_bio->bi_bdev = mdev->ldev->backing_bdev; -- cgit v1.2.3 From ebfd5d8f715167b886c9401e6b123847187f137b Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:49 +0100 Subject: drbd: drbd_al_being_io: short circuit to reduce latency A request hitting an already "hot" extent should proceed right away, even if some other requests need to wait for pending transactions. Without that short-circuit, several simultaneous make_request contexts race for committing the transaction, possibly penalizing the innocent. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 82199d9a9a61..1d7244d2a910 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -256,6 +256,7 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); unsigned enr; + bool need_transaction = false; bool locked = false; /* When called through generic_make_request(), we must delegate @@ -273,8 +274,17 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool dele D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); - for (enr = first; enr <= last; enr++) - wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)) != NULL); + if (al_ext->lc_number != enr) + need_transaction = true; + } + + /* If *this* request was to an already active extent, + * we're done, even if there are pending changes. */ + if (!need_transaction) + return; /* Serialize multiple transactions. * This uses test_and_set_bit, memory barrier is implicit. -- cgit v1.2.3 From 6d9febe237146156947f0da8407c620b5c33c1df Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:50 +0100 Subject: drbd: split __drbd_make_request in before and after drbd_al_begin_io This is in preparation to be able to defer requests that need to wait for an activity log transaction to a submitter workqueue. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 7d1ff1aaeb71..96d5968fc1e4 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -34,14 +34,14 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); /* Update disk stats at start of I/O request */ -static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) +static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req) { - const int rw = bio_data_dir(bio); + const int rw = bio_data_dir(req->master_bio); int cpu; cpu = part_stat_lock(); part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); - part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9); (void) cpu; /* The macro invocations above want the cpu argument, I do not like the compiler warning about cpu only assigned but never used... */ part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -1020,12 +1020,16 @@ drbd_submit_req_private_bio(struct drbd_request *req) bio_endio(bio, -EIO); } -void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +/* returns the new drbd_request pointer, if the caller is expected to + * drbd_send_and_submit() it (to save latency), or NULL if we queued the + * request on the submitter thread. + * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. + */ +struct drbd_request * +drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { - const int rw = bio_rw(bio); - struct bio_and_error m = { NULL, }; + const int rw = bio_data_dir(bio); struct drbd_request *req; - bool no_remote = false; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -1035,7 +1039,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long * if user cannot handle io errors, that's not our business. */ dev_err(DEV, "could not kmalloc() req\n"); bio_endio(bio, -ENOMEM); - return; + return ERR_PTR(-ENOMEM); } req->start_time = start_time; @@ -1057,6 +1061,15 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long drbd_al_begin_io(mdev, &req->i, true); } + return req; +} + +static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req) +{ + const int rw = bio_rw(req->master_bio); + struct bio_and_error m = { NULL, }; + bool no_remote = false; + spin_lock_irq(&mdev->tconn->req_lock); if (rw == WRITE) { /* This may temporarily give up the req_lock, @@ -1079,7 +1092,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long } /* Update disk stats */ - _drbd_start_io_acct(mdev, req, bio); + _drbd_start_io_acct(mdev, req); /* We fail READ/READA early, if we can not serve it. * We must do this before req is registered on any lists. @@ -1137,7 +1150,14 @@ out: if (m.bio) complete_master_bio(mdev, &m); - return; +} + +void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +{ + struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time); + if (IS_ERR_OR_NULL(req)) + return; + drbd_send_and_submit(mdev, req); } void drbd_make_request(struct request_queue *q, struct bio *bio) -- cgit v1.2.3 From 113fef9e20e0d614b3f5940b67c96e719c559eea Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 18:14:40 -0600 Subject: drbd: prepare to queue write requests on a submit worker Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 13 +++++++++++++ drivers/block/drbd/drbd_main.c | 26 +++++++++++++++++++++++++- drivers/block/drbd/drbd_nl.c | 1 + drivers/block/drbd/drbd_req.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 453fccfc440c..a6b71b6076b5 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -894,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */ } send; }; +struct submit_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t lock; + struct list_head writes; +}; + struct drbd_conf { struct drbd_tconn *tconn; int vnr; /* volume number within the connection */ @@ -1034,6 +1042,10 @@ struct drbd_conf { atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ unsigned int peer_max_bio_size; unsigned int local_max_bio_size; + + /* any requests that would block in drbd_make_request() + * are deferred to this single-threaded work queue */ + struct submit_worker submit; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1440,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn); extern int proc_details; /* drbd_req */ +extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); extern void drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e55271d6e7f6..a150b59897a0 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -45,7 +45,7 @@ #include #include #include - +#include #define __KERNEL_SYSCALLS__ #include #include @@ -2300,6 +2300,7 @@ static void drbd_cleanup(void) idr_for_each_entry(&minors, mdev, i) { idr_remove(&minors, mdev_to