summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-03-27 20:02:07 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-03-27 20:02:07 -0700
commit8d49a77568d1105ff3e64aec484dac059f54824e (patch)
tree633ee954a3cea97bf136dec933388a2e419e5dac /drivers/block
parent93567c43eb2a4771b9c590435928f9b3a428e568 (diff)
parent1ddd5049545e0aa1a0ed19bca4d9c9c3ce1ac8a2 (diff)
Merge branch 'for-2.6.39/drivers' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.39/drivers' of git://git.kernel.dk/linux-2.6-block: (122 commits) cciss: fix lost command issue drbd: need include for bitops functions declarations Revert "cciss: Add missing allocation in scsi_cmd_stack_setup and corresponding deallocation" cciss: fix missed command status value CMD_UNABORTABLE cciss: remove unnecessary casts cciss: Mask off error bits of c->busaddr in cmd_special_free when calling pci_free_consistent cciss: Inform controller we are using 32-bit tags. cciss: hoist tag masking out of loop cciss: Add missing allocation in scsi_cmd_stack_setup and corresponding deallocation cciss: export resettable host attribute drbd: drop code present under #ifdef which is relevant to 2.6.28 and below drbd: Fixed handling of read errors on a 'VerifyS' node drbd: Fixed handling of read errors on a 'VerifyT' node drbd: Implemented real timeout checking for request processing time drbd: Remove unused function atodb_endio() drbd: improve log message if received sector offset exceeds local capacity drbd: kill dead code drbd: don't BUG_ON, if bio_add_page of a single page to an empty bio fails drbd: Removed left over, now wrong comments drbd: serialize admin requests for new verify run with pending bitmap io ...
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/cciss.c86
-rw-r--r--drivers/block/cciss.h1
-rw-r--r--drivers/block/cciss_cmd.h1
-rw-r--r--drivers/block/cciss_scsi.c13
-rw-r--r--drivers/block/drbd/drbd_actlog.c335
-rw-r--r--drivers/block/drbd/drbd_bitmap.c752
-rw-r--r--drivers/block/drbd/drbd_int.h270
-rw-r--r--drivers/block/drbd/drbd_main.c673
-rw-r--r--drivers/block/drbd/drbd_nl.c183
-rw-r--r--drivers/block/drbd/drbd_proc.c114
-rw-r--r--drivers/block/drbd/drbd_receiver.c608
-rw-r--r--drivers/block/drbd/drbd_req.c169
-rw-r--r--drivers/block/drbd/drbd_req.h36
-rw-r--r--drivers/block/drbd/drbd_strings.c6
-rw-r--r--drivers/block/drbd/drbd_worker.c360
-rw-r--r--drivers/block/drbd/drbd_wrappers.h2
16 files changed, 2214 insertions, 1395 deletions
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 35658f445fca..9bf13988f1a2 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -193,7 +193,7 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev,
u64 *cfg_offset);
static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
unsigned long *memory_bar);
-
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
/* performant mode helper functions */
static void calc_bucket_map(int *bucket, int num_buckets, int nsgs,
@@ -231,7 +231,7 @@ static const struct block_device_operations cciss_fops = {
*/
static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
{
- if (likely(h->transMethod == CFGTBL_Trans_Performant))
+ if (likely(h->transMethod & CFGTBL_Trans_Performant))
c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
}
@@ -556,6 +556,44 @@ static void __devinit cciss_procinit(ctlr_info_t *h)
#define to_hba(n) container_of(n, struct ctlr_info, dev)
#define to_drv(n) container_of(n, drive_info_struct, dev)
+/* List of controllers which cannot be reset on kexec with reset_devices */
+static u32 unresettable_controller[] = {
+ 0x324a103C, /* Smart Array P712m */
+ 0x324b103C, /* SmartArray P711m */
+ 0x3223103C, /* Smart Array P800 */
+ 0x3234103C, /* Smart Array P400 */
+ 0x3235103C, /* Smart Array P400i */
+ 0x3211103C, /* Smart Array E200i */
+ 0x3212103C, /* Smart Array E200 */
+ 0x3213103C, /* Smart Array E200i */
+ 0x3214103C, /* Smart Array E200i */
+ 0x3215103C, /* Smart Array E200i */
+ 0x3237103C, /* Smart Array E500 */
+ 0x323D103C, /* Smart Array P700m */
+ 0x409C0E11, /* Smart Array 6400 */
+ 0x409D0E11, /* Smart Array 6400 EM */
+};
+
+static int ctlr_is_resettable(struct ctlr_info *h)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
+ if (unresettable_controller[i] == h->board_id)
+ return 0;
+ return 1;
+}
+
+static ssize_t host_show_resettable(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ctlr_info *h = to_hba(dev);
+
+ return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h));
+}
+static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
+
static ssize_t host_store_rescan(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -741,6 +779,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
static struct attribute *cciss_host_attrs[] = {
&dev_attr_rescan.attr,
+ &dev_attr_resettable.attr,
NULL
};
@@ -973,8 +1012,8 @@ static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c)
temp64.val32.upper = c->ErrDesc.Addr.upper;
pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
c->err_info, (dma_addr_t) temp64.val);
- pci_free_consistent(h->pdev, sizeof(CommandList_struct),
- c, (dma_addr_t) c->busaddr);
+ pci_free_consistent(h->pdev, sizeof(CommandList_struct), c,
+ (dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr));
}
static inline ctlr_info_t *get_host(struct gendisk *disk)
@@ -1490,8 +1529,7 @@ static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
return -EINVAL;
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
- ioc = (BIG_IOCTL_Command_struct *)
- kmalloc(sizeof(*ioc), GFP_KERNEL);
+ ioc = kmalloc(sizeof(*ioc), GFP_KERNEL);
if (!ioc) {
status = -ENOMEM;
goto cleanup1;
@@ -2653,6 +2691,10 @@ static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
c->Request.CDB[0]);
return_status = IO_NEEDS_RETRY;
break;
+ case CMD_UNABORTABLE:
+ dev_warn(&h->pdev->dev, "cmd unabortable\n");
+ return_status = IO_ERROR;
+ break;
default:
dev_warn(&h->pdev->dev, "cmd 0x%02x returned "
"unknown status %x\n", c->Request.CDB[0],
@@ -3103,6 +3145,13 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
DID_PASSTHROUGH : DID_ERROR);
break;
+ case CMD_UNABORTABLE:
+ dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
default:
dev_warn(&h->pdev->dev, "cmd %p returned "
"unknown status %x\n", cmd,
@@ -3136,10 +3185,13 @@ static inline u32 cciss_tag_to_index(u32 tag)
return tag >> DIRECT_LOOKUP_SHIFT;
}
-static inline u32 cciss_tag_discard_error_bits(u32 tag)
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag)
{
-#define CCISS_ERROR_BITS 0x03
- return tag & ~CCISS_ERROR_BITS;
+#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1)
+#define CCISS_SIMPLE_ERROR_BITS 0x03
+ if (likely(h->transMethod & CFGTBL_Trans_Performant))
+ return tag & ~CCISS_PERF_ERROR_BITS;
+ return tag & ~CCISS_SIMPLE_ERROR_BITS;
}
static inline void cciss_mark_tag_indexed(u32 *tag)
@@ -3359,7 +3411,7 @@ static inline u32 next_command(ctlr_info_t *h)
{
u32 a;
- if (unlikely(h->transMethod != CFGTBL_Trans_Performant))
+ if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant)))
return h->access.command_completed(h);
if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
@@ -3394,14 +3446,12 @@ static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag)
/* process completion of a non-indexed command */
static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
{
- u32 tag;
CommandList_struct *c = NULL;
__u32 busaddr_masked, tag_masked;
- tag = cciss_tag_discard_error_bits(raw_tag);
+ tag_masked = cciss_tag_discard_error_bits(h, raw_tag);
list_for_each_entry(c, &h->cmpQ, list) {
- busaddr_masked = cciss_tag_discard_error_bits(c->busaddr);
- tag_masked = cciss_tag_discard_error_bits(tag);
+ busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr);
if (busaddr_masked == tag_masked) {
finish_cmd(h, c, raw_tag);
return next_command(h);
@@ -3753,7 +3803,8 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h)
}
}
-static __devinit void cciss_enter_performant_mode(ctlr_info_t *h)
+static __devinit void cciss_enter_performant_mode(ctlr_info_t *h,
+ u32 use_short_tags)
{
/* This is a bit complicated. There are 8 registers on
* the controller which we write to to tell it 8 different
@@ -3808,7 +3859,7 @@ static __devinit void cciss_enter_performant_mode(ctlr_info_t *h)
writel(0, &h->transtable->RepQCtrAddrHigh32);
writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
writel(0, &h->transtable->RepQAddr0High32);
- writel(CFGTBL_Trans_Performant,
+ writel(CFGTBL_Trans_Performant | use_short_tags,
&(h->cfgtable->HostWrite.TransportRequest));
writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
@@ -3855,7 +3906,8 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h)
if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
goto clean_up;
- cciss_enter_performant_mode(h);
+ cciss_enter_performant_mode(h,
+ trans_support & CFGTBL_Trans_use_short_tags);
/* Change the access methods to the performant access methods */
h->access = SA5_performant_access;
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 579f74918493..554bbd907d14 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -222,6 +222,7 @@ static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c)
h->ctlr, c->busaddr);
#endif /* CCISS_DEBUG */
writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
+ readl(h->vaddr + SA5_REQUEST_PORT_OFFSET);
h->commands_outstanding++;
if ( h->commands_outstanding > h->max_outstanding)
h->max_outstanding = h->commands_outstanding;
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index 35463d2f0ee7..cd441bef031f 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -56,6 +56,7 @@
#define CFGTBL_Trans_Simple 0x00000002l
#define CFGTBL_Trans_Performant 0x00000004l
+#define CFGTBL_Trans_use_short_tags 0x20000000l
#define CFGTBL_BusType_Ultra2 0x00000001l
#define CFGTBL_BusType_Ultra3 0x00000002l
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 727d0225b7d0..df793803f5ae 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -824,13 +824,18 @@ static void complete_scsi_command(CommandList_struct *c, int timeout,
break;
case CMD_UNSOLICITED_ABORT:
cmd->result = DID_ABORT << 16;
- dev_warn(&h->pdev->dev, "%p aborted do to an "
+ dev_warn(&h->pdev->dev, "%p aborted due to an "
"unsolicited abort\n", c);
break;
case CMD_TIMEOUT:
cmd->result = DID_TIME_OUT << 16;
dev_warn(&h->pdev->dev, "%p timedout\n", c);
break;
+ case CMD_UNABORTABLE:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev, "c %p command "
+ "unabortable\n", c);
+ break;
default:
cmd->result = DID_ERROR << 16;
dev_warn(&h->pdev->dev,
@@ -1007,11 +1012,15 @@ cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
break;
case CMD_UNSOLICITED_ABORT:
dev_warn(&h->pdev->dev,
- "%p aborted do to an unsolicited abort\n", c);
+ "%p aborted due to an unsolicited abort\n", c);
break;
case CMD_TIMEOUT:
dev_warn(&h->pdev->dev, "%p timedout\n", c);
break;
+ case CMD_UNABORTABLE:
+ dev_warn(&h->pdev->dev,
+ "%p unabortable\n", c);
+ break;
default:
dev_warn(&h->pdev->dev,
"%p returned unknown status %x\n",
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index aca302492ff2..2a1642bc451d 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,7 +92,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
- if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_endio(bio, -EIO);
else
submit_bio(rw, bio);
@@ -176,13 +176,17 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
struct lc_element *al_ext;
struct lc_element *tmp;
unsigned long al_flags = 0;
+ int wake;
spin_lock_irq(&mdev->al_lock);
tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
if (unlikely(tmp != NULL)) {
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
spin_unlock_irq(&mdev->al_lock);
+ if (wake)
+ wake_up(&mdev->al_wait);
return NULL;
}
}
@@ -258,6 +262,33 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
spin_unlock_irqrestore(&mdev->al_lock, flags);
}
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+ return al_enr >>
+ /* bit to page */
+ ((PAGE_SHIFT + 3) -
+ /* al extent number to bit */
+ (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
+{
+ return rs_enr >>
+ /* bit to page */
+ ((PAGE_SHIFT + 3) -
+ /* al extent number to bit */
+ (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
+}
+
int
w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
@@ -285,7 +316,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
* For now, we must not write the transaction,
* if we cannot write out the bitmap of the evicted extent. */
if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
- drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+ drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
/* The bitmap write may have failed, causing a state change. */
if (mdev->state.disk < D_INCONSISTENT) {
@@ -334,7 +365,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
- drbd_chk_io_error(mdev, 1, TRUE);
+ drbd_chk_io_error(mdev, 1, true);
if (++mdev->al_tr_pos >
div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
@@ -511,225 +542,6 @@ cancel:
return 1;
}
-static void atodb_endio(struct bio *bio, int error)
-{
- struct drbd_atodb_wait *wc = bio->bi_private;
- struct drbd_conf *mdev = wc->mdev;
- struct page *page;
- int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
- /* strange behavior of some lower level drivers...
- * fail the request by clearing the uptodate flag,
- * but do not return any error?! */
- if (!error && !uptodate)
- error = -EIO;
-
- drbd_chk_io_error(mdev, error, TRUE);
- if (error && wc->error == 0)
- wc->error = error;
-
- if (atomic_dec_and_test(&wc->count))
- complete(&wc->io_done);
-
- page = bio->bi_io_vec[0].bv_page;
- put_page(page);
- bio_put(bio);
- mdev->bm_writ_cnt++;
- put_ldev(mdev);
-}
-
-/* sector to word */
-#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
-
-/* activity log to on disk bitmap -- prepare bio unless that sector
- * is already covered by previously prepared bios */
-static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
- struct bio **bios,
- unsigned int enr,
- struct drbd_atodb_wait *wc) __must_hold(local)
-{
- struct bio *bio;
- struct page *page;
- sector_t on_disk_sector;
- unsigned int page_offset = PAGE_SIZE;
- int offset;
- int i = 0;
- int err = -ENOMEM;
-
- /* We always write aligned, full 4k blocks,
- * so we can ignore the logical_block_size (for now) */
- enr &= ~7U;
- on_disk_sector = enr + mdev->ldev->md.md_offset
- + mdev->ldev->md.bm_offset;
-
- D_ASSERT(!(on_disk_sector & 7U));
-
- /* Check if that enr is already covered by an already created bio.
- * Caution, bios[] is not NULL terminated,
- * but only initialized to all NULL.
- * For completely scattered activity log,
- * the last invocation iterates over all bios,
- * and finds the last NULL entry.
- */
- while ((bio = bios[i])) {
- if (bio->bi_sector == on_disk_sector)
- return 0;
- i++;
- }
- /* bios[i] == NULL, the next not yet used slot */
-
- /* GFP_KERNEL, we are not in the write-out path */
- bio = bio_alloc(GFP_KERNEL, 1);
- if (bio == NULL)
- return -ENOMEM;
-
- if (i > 0) {
- const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
- page_offset = prev_bv->bv_offset + prev_bv->bv_len;
- page = prev_bv->bv_page;
- }
- if (page_offset == PAGE_SIZE) {
- page = alloc_page(__GFP_HIGHMEM);
- if (page == NULL)
- goto out_bio_put;
- page_offset = 0;
- } else {
- get_page(page);
- }
-
- offset = S2W(enr);
- drbd_bm_get_lel(mdev, offset,
- min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
- kmap(page) + page_offset);
- kunmap(page);
-
- bio->bi_private = wc;
- bio->bi_end_io = atodb_endio;
- bio->bi_bdev = mdev->ldev->md_bdev;
- bio->bi_sector = on_disk_sector;
-
- if (bio_add_page(bio, page, 4096, page_offset) != 4096)
- goto out_put_page;
-
- atomic_inc(&wc->count);
- /* we already know that we may do this...
- * get_ldev_if_state(mdev,D_ATTACHING);
- * just get the extra reference, so that the local_cnt reflects
- * the number of pending IO requests DRBD at its backing device.
- */
- atomic_inc(&mdev->local_cnt);
-
- bios[i] = bio;
-
- return 0;
-
-out_put_page:
- err = -EINVAL;
- put_page(page);
-out_bio_put:
- bio_put(bio);
- return err;
-}
-
-/**
- * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
- * @mdev: DRBD device.
- *
- * Called when we detach (unconfigure) local storage,
- * or when we go from R_PRIMARY to R_SECONDARY role.
- */
-void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
-{
- int i, nr_elements;
- unsigned int enr;
- struct bio **bios;
- struct drbd_atodb_wait wc;
-
- ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
- return; /* sorry, I don't have any act_log etc... */
-
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
- nr_elements = mdev->act_log->nr_elements;
-
- /* GFP_KERNEL, we are not in anyone's write-out path */
- bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
- if (!bios)
- goto submit_one_by_one;
-
- atomic_set(&wc.count, 0);
- init_completion(&wc.io_done);
- wc.mdev = mdev;
- wc.error = 0;
-
- for (i = 0; i < nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- /* next statement also does atomic_inc wc.count and local_cnt */
- if (atodb_prepare_unless_covered(mdev, bios,
- enr/AL_EXT_PER_BM_SECT,
- &wc))
- goto free_bios_submit_one_by_one;
- }
-
- /* unnecessary optimization? */
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
-
- /* all prepared, submit them */
- for (i = 0; i < nr_elements; i++) {
- if (bios[i] == NULL)
- break;
- if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
- bios[i]->bi_rw = WRITE;
- bio_endio(bios[i], -EIO);
- } else {
- submit_bio(WRITE, bios[i]);
- }
- }
-
- /* always (try to) flush bitmap to stable storage */
- drbd_md_flush(mdev);
-
- /* In case we did not submit a single IO do not wait for
- * them to complete. ( Because we would wait forever here. )
- *
- * In case we had IOs and they are already complete, there
- * is not point in waiting anyways.
- * Therefore this if () ... */
- if (atomic_read(&wc.count))
- wait_for_completion(&wc.io_done);
-
- put_ldev(mdev);
-
- kfree(bios);
- return;
-
- free_bios_submit_one_by_one:
- /* free everything by calling the endio callback directly. */
- for (i = 0; i < nr_elements && bios[i]; i++)
- bio_endio(bios[i], 0);
-
- kfree(bios);
-
- submit_one_by_one:
- dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
-
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- /* Really slow: if we have al-extents 16..19 active,
- * sector 4 will be written four times! Synchronous! */
- drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
- }
-
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
- put_ldev(mdev);
-}
-
/**
* drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
* @mdev: DRBD device.
@@ -809,7 +621,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
return 1;
}
- drbd_bm_write_sect(mdev, udw->enr);
+ drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
put_ldev(mdev);
kfree(udw);
@@ -889,7 +701,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
dev_warn(DEV, "Kicking resync_lru element enr=%u "
"out with rs_failed=%d\n",
ext->lce.lc_number, ext->rs_failed);
- set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
ext->rs_left = rs_left;
ext->rs_failed = success ? 0 : count;
@@ -908,7 +719,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
drbd_queue_work_front(&mdev->data.work, &udw->w);
} else {
dev_warn(DEV, "Could not kmalloc an udw\n");
- set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
}
} else {
@@ -919,6 +729,22 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
}
}
+void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
+{
+ unsigned long now = jiffies;
+ unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
+ int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+ if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+ if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
+ mdev->state.conn != C_PAUSED_SYNC_T &&
+ mdev->state.conn != C_PAUSED_SYNC_S) {
+ mdev->rs_mark_time[next] = now;
+ mdev->rs_mark_left[next] = still_to_go;
+ mdev->rs_last_mark = next;
+ }
+ }
+}
+
/* clear the bit corresponding to the piece of storage in question:
* size byte of data starting from sector. Only clear a bits of the affected
* one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -936,7 +762,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
int wake_up = 0;
unsigned long flags;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
@@ -969,21 +795,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
*/
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
if (count && get_ldev(mdev)) {
- unsigned long now = jiffies;
- unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
- int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
- if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
- unsigned long tw = drbd_bm_total_weight(mdev);
- if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
- mdev->state.conn != C_PAUSED_SYNC_T &&
- mdev->state.conn != C_PAUSED_SYNC_S) {
- mdev->rs_mark_time[next] = now;
- mdev->rs_mark_left[next] = tw;
- mdev->rs_last_mark = next;
- }
- }
+ drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
spin_lock_irqsave(&mdev->al_lock, flags);
- drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+ drbd_try_clear_on_disk_bm(mdev, sector, count, true);
spin_unlock_irqrestore(&mdev->al_lock, flags);
/* just wake_up unconditional now, various lc_chaged(),
@@ -998,27 +812,27 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
/*
* this is intended to set one request worth of data out of sync.
* affects at least 1 bit,
- * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
*
* called by tl_clear and drbd_send_dblock (==drbd_make_request).
* so this can be _any_ process.
*/
-void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
const char *file, const unsigned int line)
{
unsigned long sbnr, ebnr, lbnr, flags;
sector_t esector, nr_sectors;
- unsigned int enr, count;
+ unsigned int enr, count = 0;
struct lc_element *e;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
- return;
+ return 0;
}
if (!get_ldev(mdev))
- return; /* no disk, no metadata, no bitmap to set bits in */
+ return 0; /* no disk, no metadata, no bitmap to set bits in */
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
@@ -1048,6 +862,8 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
out:
put_ldev(mdev);
+
+ return count;
}
static
@@ -1128,7 +944,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
unsigned int enr = BM_SECT_TO_EXT(sector);
struct bm_extent *bm_ext;
int i, sig;
+ int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
+ 200 times -> 20 seconds. */
+retry:
sig = wait_event_interruptible(mdev->al_wait,
(bm_ext = _bme_get(mdev, enr)));
if (sig)
@@ -1139,16 +958,25 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
sig = wait_event_interruptible(mdev->al_wait,
- !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
- if (sig) {
+ !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
+ test_bit(BME_PRIORITY, &bm_ext->flags));
+
+ if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
spin_lock_irq(&mdev->al_lock);
if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
- clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
mdev->resync_locked--;
wake_up(&mdev->al_wait);
}
spin_unlock_irq(&mdev->al_lock);
- return -EINTR;
+ if (sig)
+ return -EINTR;
+ if (schedule_timeout_interruptible(HZ/10))
+ return -EINTR;
+ if (sa && --sa == 0)
+ dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec."
+ "Resync stalled?\n");
+ goto retry;
}
}
set_bit(BME_LOCKED, &bm_ext->flags);
@@ -1291,8 +1119,7 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
}
if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
- clear_bit(BME_LOCKED, &bm_ext->flags);
- clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
mdev->resync_locked--;
wake_up(&mdev->al_wait);
}
@@ -1383,7 +1210,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
sector_t esector, nr_sectors;
int wake_up = 0;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+ if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
@@ -1420,7 +1247,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
mdev->rs_failed += count;
if (get_ldev(mdev)) {
- drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
+ drbd_try_clear_on_disk_bm(mdev, sector, count, false);
put_ldev(mdev);
}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 0645ca829a94..f0ae63d2df65 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -28,18 +28,58 @@
#include <linux/drbd.h>
#include <linux/slab.h>
#include <asm/kmap_types.h>
+
+#include <asm-generic/bitops/le.h>
+
#include "drbd_int.h"
+
/* OPAQUE outside this file!
* interface defined in drbd_int.h
* convention:
* function name drbd_bm_... => used elsewhere, "public".
* function name bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50 bytes backend storage (1 PiB)
+ * 1 << (50 - 12) bits needed
+ * 38 --> we need u64 to index and count bits
+ * 1 << (38 - 3) bitmap bytes needed
+ * 35 --> we still need u64 to index and count bytes
+ * (that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2) 32bit longs needed
+ * 33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3) 64bit longs needed
+ * 32 --> we could get away with a 32bit unsigned int to index and count
+ * 64bit long words, but I rather stay with unsigned long for now.
+ * We probably should neither count nor point to bytes or long words
+ * directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ * 22 --> we need that much 4KiB pages of bitmap.
+ * 1 << (22 + 3) --> on a 64bit arch,
+ * we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
- * Note that since find_first_bit returns int, at the current granularity of
- * the bitmap (4KB per byte), this implementation "only" supports up to
- * 1<<(32+12) == 16 TB...
+ * bitmap storage and IO:
+ * Bitmap is stored little endian on disk, and is kept little endian in
+ * core memory. Currently we still hold the full bitmap in core as long
+ * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ * seems excessive.
+ *
+ * We plan to reduce the amount of in-core bitmap pages by pageing them in
+ * and out against their on-disk location as necessary, but need to make
+ * sure we don't cause too much meta data IO, and must not deadlock in
+ * tight memory situations. This needs some more work.
*/
/*
@@ -55,13 +95,9 @@
struct drbd_bitmap {
struct page **bm_pages;
spinlock_t bm_lock;
- /* WARNING unsigned long bm_*:
- * 32bit number of bit offset is just enough for 512 MB bitmap.
- * it will blow up if we make the bitmap bigger...
- * not that it makes much sense to have a bitmap that large,
- * rather change the granularity to 16k or 64k or something.
- * (that implies other problems, however...)
- */
+
+ /* see LIMITATIONS: above */
+
unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
unsigned long bm_bits;
size_t bm_words;
@@ -69,29 +105,18 @@ struct drbd_bitmap {
sector_t bm_dev_capacity;
struct mutex bm_change; /* serializes resize operations */
- atomic_t bm_async_io;
- wait_queue_head_t bm_io_wait;
+ wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
- unsigned long bm_flags;
+ enum bm_flag bm_flags;
/* debugging aid, in case we are still racy somewhere */
char *bm_why;
struct task_struct *bm_task;
};
-/* definition of bits in bm_flags */
-#define BM_LOCKED 0
-#define BM_MD_IO_ERROR 1
-#define BM_P_VMALLOCED 2
-
static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
unsigned long e, int val, const enum km_type km);
-static int bm_is_locked(struct drbd_bitmap *b)
-{
- return test_bit(BM_LOCKED, &b->bm_flags);
-}
-
#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
{
@@ -108,7 +133,7 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)