summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-11 12:12:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-11 12:12:28 -0700
commit3bf7878f0f7d60c394f6d6631bb179e86f09f73c (patch)
treef998ef959865db1657baa410c4ed281ad9003183 /net
parent07d306c838c5c30196619baae36107d0615e459b (diff)
parent33e9c8dbfbcef8e4cda8e43a445e692ab7e0d8c0 (diff)
Merge tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "The main item here is support for v12.y.z ("Luminous") clusters: RESEND_ON_SPLIT, RADOS_BACKOFF, OSDMAP_PG_UPMAP and CRUSH_CHOOSE_ARGS feature bits, and various other changes in the RADOS client protocol. On top of that we have a new fsc mount option to allow supplying fscache uniquifier (similar to NFS) and the usual pile of filesystem fixes from Zheng" * tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client: (44 commits) libceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS libceph: osd_state is 32 bits wide in luminous crush: remove an obsolete comment crush: crush_init_workspace starts with struct crush_work libceph, crush: per-pool crush_choose_arg_map for crush_do_rule() crush: implement weight and id overrides for straw2 libceph: apply_upmap() libceph: compute actual pgid in ceph_pg_to_up_acting_osds() libceph: pg_upmap[_items] infrastructure libceph: ceph_decode_skip_* helpers libceph: kill __{insert,lookup,remove}_pg_mapping() libceph: introduce and switch to decode_pg_mapping() libceph: don't pass pgid by value libceph: respect RADOS_BACKOFF backoffs libceph: make DEFINE_RB_* helpers more general libceph: avoid unnecessary pi lookups in calc_target() libceph: use target pi for calc_target() calculations libceph: always populate t->target_{oid,oloc} in calc_target() libceph: make sure need_resend targets reflect latest map libceph: delete from need_resend_linger before check_linger_pool_dne() ...
Diffstat (limited to 'net')
-rw-r--r--net/ceph/ceph_common.c1
-rw-r--r--net/ceph/crush/crush.c3
-rw-r--r--net/ceph/crush/mapper.c81
-rw-r--r--net/ceph/debugfs.c112
-rw-r--r--net/ceph/messenger.c10
-rw-r--r--net/ceph/mon_client.c8
-rw-r--r--net/ceph/osd_client.c905
-rw-r--r--net/ceph/osdmap.c840
8 files changed, 1657 insertions, 303 deletions
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 47e94b560ba0..3d265c5cb6d0 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type)
case CEPH_MSG_OSD_OP: return "osd_op";
case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+ case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
default: return "unknown";
}
}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 5bf94c04f645..4b428f46a8ca 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -1,6 +1,7 @@
#ifdef __KERNEL__
# include <linux/slab.h>
# include <linux/crush/crush.h>
+void clear_choose_args(struct crush_map *c);
#else
# include "crush_compat.h"
# include "crush.h"
@@ -127,6 +128,8 @@ void crush_destroy(struct crush_map *map)
#ifndef __KERNEL__
kfree(map->choose_tries);
+#else
+ clear_choose_args(map);
#endif
kfree(map);
}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b5cd8c21bfdf..746b145bfd11 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin)
*
*/
+static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
+ const struct crush_choose_arg *arg,
+ int position)
+{
+ if (!arg || !arg->weight_set || arg->weight_set_size == 0)
+ return bucket->item_weights;
+
+ if (position >= arg->weight_set_size)
+ position = arg->weight_set_size - 1;
+ return arg->weight_set[position].weights;
+}
+
+static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
+ const struct crush_choose_arg *arg)
+{
+ if (!arg || !arg->ids)
+ return bucket->h.items;
+
+ return arg->ids;
+}
+
static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
- int x, int r)
+ int x, int r,
+ const struct crush_choose_arg *arg,
+ int position)
{
unsigned int i, high = 0;
unsigned int u;
- unsigned int w;
__s64 ln, draw, high_draw = 0;
+ __u32 *weights = get_choose_arg_weights(bucket, arg, position);
+ __s32 *ids = get_choose_arg_ids(bucket, arg);
for (i = 0; i < bucket->h.size; i++) {
- w = bucket->item_weights[i];
- if (w) {
- u = crush_hash32_3(bucket->h.hash, x,
- bucket->h.items[i], r);
+ dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
+ if (weights[i]) {
+ u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
u &= 0xffff;
/*
@@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
* weight means a larger (less negative) value
* for draw.
*/
- draw = div64_s64(ln, w);
+ draw = div64_s64(ln, weights[i]);
} else {
draw = S64_MIN;
}
@@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
static int crush_bucket_choose(const struct crush_bucket *in,
struct crush_work_bucket *work,
- int x, int r)
+ int x, int r,
+ const struct crush_choose_arg *arg,
+ int position)
{
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
BUG_ON(in->size == 0);
@@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in,
case CRUSH_BUCKET_STRAW2:
return bucket_straw2_choose(
(const struct crush_bucket_straw2 *)in,
- x, r);
+ x, r, arg, position);
default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
@@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int vary_r,
unsigned int stable,
int *out2,
- int parent_r)
+ int parent_r,
+ const struct crush_choose_arg *choose_args)
{
int rep;
unsigned int ftotal, flocal;
@@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map,
else
item = crush_bucket_choose(
in, work->work[-1-in->id],
- x, r);
+ x, r,
+ (choose_args ?
+ &choose_args[-1-in->id] : 0),
+ outpos);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
skip_rep = 1;
@@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map,
vary_r,
stable,
NULL,
- sub_r) <= outpos)
+ sub_r,
+ choose_args) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
@@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map,
unsigned int recurse_tries,
int recurse_to_leaf,
int *out2,
- int parent_r)
+ int parent_r,
+ const struct crush_choose_arg *choose_args)
{
const struct crush_bucket *in = bucket;
int endpos = outpos + left;
@@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map,
item = crush_bucket_choose(
in, work->work[-1-in->id],
- x, r);
+ x, r,
+ (choose_args ?
+ &choose_args[-1-in->id] : 0),
+ outpos);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
out[rep] = CRUSH_ITEM_NONE;
@@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map,
x, 1, numrep, 0,
out2, rep,
recurse_tries, 0,
- 0, NULL, r);
+ 0, NULL, r,
+ choose_args);
if (out2[rep] == CRUSH_ITEM_NONE) {
/* placed nothing; no leaf */
break;
@@ -823,7 +858,7 @@ void crush_init_workspace(const struct crush_map *map, void *v)
* set the pointer first and then reserve the space for it to
* point to by incrementing the point.
*/
- v += sizeof(struct crush_work *);
+ v += sizeof(struct crush_work);
w->work = v;
v += map->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < map->max_buckets; ++b) {
@@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v)
* @weight: weight vector (for map leaves)
* @weight_max: size of weight vector
* @cwin: pointer to at least crush_work_size() bytes of memory
+ * @choose_args: weights and ids for each known bucket
*/
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max,
- void *cwin)
+ void *cwin, const struct crush_choose_arg *choose_args)
{
int result_len;
struct crush_work *cw = cwin;
@@ -968,11 +1004,6 @@ int crush_do_rule(const struct crush_map *map,
for (i = 0; i < wsize; i++) {
int bno;
- /*
- * see CRUSH_N, CRUSH_N_MINUS macros.
- * basically, numrep <= 0 means relative to
- * the provided result_max
- */
numrep = curstep->arg1;
if (numrep <= 0) {
numrep += result_max;
@@ -1013,7 +1044,8 @@ int crush_do_rule(const struct crush_map *map,
vary_r,
stable,
c+osize,
- 0);
+ 0,
+ choose_args);
} else {
out_size = ((numrep < (result_max-osize)) ?
numrep : (result_max-osize));
@@ -1030,7 +1062,8 @@ int crush_do_rule(const struct crush_map *map,
choose_leaf_tries : 1,
recurse_to_leaf,
c+osize,
- 0);
+ 0,
+ choose_args);
osize += out_size;
}
}
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 71ba13927b3d..fa5233e0d01c 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p)
}
for (i = 0; i < map->max_osd; i++) {
struct ceph_entity_addr *addr = &map->osd_addr[i];
- int state = map->osd_state[i];
+ u32 state = map->osd_state[i];
char sb[64];
seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
@@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p)
seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
pg->pgid.seed, pg->primary_temp.osd);
}
+ for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_upmap.len; i++)
+ seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+ pg->pg_upmap.osds[i]);
+ seq_printf(s, "]\n");
+ }
+ for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_upmap_items.len; i++)
+ seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","),
+ pg->pg_upmap_items.from_to[i][0],
+ pg->pg_upmap_items.from_to[i][1]);
+ seq_printf(s, "]\n");
+ }
up_read(&osdc->lock);
return 0;
@@ -147,17 +170,26 @@ static int monc_show(struct seq_file *s, void *p)
return 0;
}
+static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid)
+{
+ seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed);
+ if (spgid->shard != CEPH_SPG_NOSHARD)
+ seq_printf(s, "s%d", spgid->shard);
+}
+
static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
{
int i;
- seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+ seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed);
+ dump_spgid(s, &t->spgid);
+ seq_puts(s, "\t[");
for (i = 0; i < t->up.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
- seq_printf(s, "]/%d\t", t->acting.primary);
+ seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch);
if (t->target_oloc.pool_ns) {
seq_printf(s, "%*pE/%*pE\t0x%x",
(int)t->target_oloc.pool_ns->len,
@@ -234,6 +266,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
mutex_unlock(&osd->lock);
}
+static void dump_snapid(struct seq_file *s, u64 snapid)
+{
+ if (snapid == CEPH_NOSNAP)
+ seq_puts(s, "head");
+ else if (snapid == CEPH_SNAPDIR)
+ seq_puts(s, "snapdir");
+ else
+ seq_printf(s, "%llx", snapid);
+}
+
+static void dump_name_escaped(struct seq_file *s, unsigned char *name,
+ size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (name[i] == '%' || name[i] == ':' || name[i] == '/' ||
+ name[i] < 32 || name[i] >= 127) {
+ seq_printf(s, "%%%02x", name[i]);
+ } else {
+ seq_putc(s, name[i]);
+ }
+ }
+}
+
+static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid)
+{
+ if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max &&
+ hoid->pool == S64_MIN) {
+ seq_puts(s, "MIN");
+ return;
+ }
+ if (hoid->is_max) {
+ seq_puts(s, "MAX");
+ return;
+ }
+ seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits);
+ dump_name_escaped(s, hoid->nspace, hoid->nspace_len);
+ seq_putc(s, ':');
+ dump_name_escaped(s, hoid->key, hoid->key_len);
+ seq_putc(s, ':');
+ dump_name_escaped(s, hoid->oid, hoid->oid_len);
+ seq_putc(s, ':');
+ dump_snapid(s, hoid->snapid);
+}
+
+static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
+{
+ struct rb_node *n;
+
+ mutex_lock(&osd->lock);
+ for (n = rb_first(&osd->o_backoffs_by_id); n; n = rb_next(n)) {
+ struct ceph_osd_backoff *backoff =
+ rb_entry(n, struct ceph_osd_backoff, id_node);
+
+ seq_printf(s, "osd%d\t", osd->o_osd);
+ dump_spgid(s, &backoff->spgid);
+ seq_printf(s, "\t%llu\t", backoff->id);
+ dump_hoid(s, backoff->begin);
+ seq_putc(s, '\t');
+ dump_hoid(s, backoff->end);
+ seq_putc(s, '\n');
+ }
+
+ mutex_unlock(&osd->lock);
+}
+
static int osdc_show(struct seq_file *s, void *pp)
{
struct ceph_client *client = s->private;
@@ -259,6 +358,13 @@ static int osdc_show(struct seq_file *s, void *pp)
}
dump_linger_requests(s, &osdc->homeless_osd);
+ seq_puts(s, "BACKOFFS\n");
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ dump_backoffs(s, osd);
+ }
+
up_read(&osdc->lock);
return 0;
}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 588a91930051..0c31035bbfee 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1288,13 +1288,16 @@ static void prepare_write_message(struct ceph_connection *con)
m->hdr.seq = cpu_to_le64(++con->out_seq);
m->needs_out_seq = false;
}
- WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+ if (con->ops->reencode_message)
+ con->ops->reencode_message(m);
dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
m, con->out_seq, le16_to_cpu(m->hdr.type),
le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
m->data_length);
- BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+ WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
/* tag + hdr + front + middle */
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
@@ -2033,8 +2036,7 @@ static int process_connect(struct ceph_connection *con)
{
u64 sup_feat = from_msgr(con->msgr)->supported_features;
u64 req_feat = from_msgr(con->msgr)->required_features;
- u64 server_feat = ceph_sanitize_features(
- le64_to_cpu(con->in_reply.features));
+ u64 server_feat = le64_to_cpu(con->in_reply.features);
int ret;
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 250f11f78609..875675765531 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -6,6 +6,7 @@
#include <linux/random.h>
#include <linux/sched.h>
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/debugfs.h>
@@ -297,6 +298,10 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
mutex_lock(&monc->mutex);
if (monc->sub_renew_sent) {
+ /*
+ * This is only needed for legacy (infernalis or older)
+ * MONs -- see delayed_work().
+ */
monc->sub_renew_after = monc->sub_renew_sent +
(seconds >> 1) * HZ - 1;
dout("%s sent %lu duration %d renew after %lu\n", __func__,
@@ -955,7 +960,8 @@ static void delayed_work(struct work_struct *work)
__validate_auth(monc);
}
- if (is_auth) {
+ if (is_auth &&
+ !(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) {
unsigned long now = jiffies;
dout("%s renew subs? now %lu renew after %lu\n",
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 924f07c36ddb..86a9737d8e3f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -12,6 +12,7 @@
#include <linux/bio.h>
#endif
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/messenger.h>
@@ -49,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq);
static void unlink_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq);
+static void clear_backoffs(struct ceph_osd *osd);
#if 1
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
@@ -373,6 +375,7 @@ static void target_copy(struct ceph_osd_request_target *dest,
ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
dest->pgid = src->pgid; /* struct */
+ dest->spgid = src->spgid; /* struct */
dest->pg_num = src->pg_num;
dest->pg_num_mask = src->pg_num_mask;
ceph_osds_copy(&dest->acting, &src->acting);
@@ -384,6 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
dest->flags = src->flags;
dest->paused = src->paused;
+ dest->epoch = src->epoch;
+ dest->last_force_resend = src->last_force_resend;
+
dest->osd = src->osd;
}
@@ -537,7 +543,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
-static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
{
return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
}
@@ -552,17 +558,21 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
/* create request message */
- msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
- msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
+ msg_size = CEPH_ENCODING_START_BLK_LEN +
+ CEPH_PGID_ENCODING_LEN + 1; /* spgid */
+ msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ sizeof(struct ceph_osd_reqid); /* reqid */
+ msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
+ msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
msg_size += CEPH_ENCODING_START_BLK_LEN +
ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
- msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + req->r_base_oid.name_len; /* oid */
msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
- msg_size += 4; /* retry_attempt */
+ msg_size += 4 + 8; /* retry_attempt, features */
if (req->r_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -1010,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd)
RB_CLEAR_NODE(&osd->o_node);
osd->o_requests = RB_ROOT;
osd->o_linger_requests = RB_ROOT;
+ osd->o_backoff_mappings = RB_ROOT;
+ osd->o_backoffs_by_id = RB_ROOT;
INIT_LIST_HEAD(&osd->o_osd_lru);
INIT_LIST_HEAD(&osd->o_keepalive_item);
osd->o_incarnation = 1;
@@ -1021,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd)
WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
+ WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
WARN_ON(!list_empty(&osd->o_osd_lru));
WARN_ON(!list_empty(&osd->o_keepalive_item));
@@ -1141,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd)
unlink_linger(osd, lreq);
link_linger(&osdc->homeless_osd, lreq);
}
+ clear_backoffs(osd);
__remove_osd_from_lru(osd);
erase_osd(&osdc->osds, osd);
@@ -1297,7 +1312,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
__pool_full(pi);
- WARN_ON(pi->id != t->base_oloc.pool);
+ WARN_ON(pi->id != t->target_oloc.pool);
return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
(osdc->osdmap->epoch < osdc->epoch_barrier);
@@ -1311,19 +1326,21 @@ enum calc_target_result {
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osd_request_target *t,
- u32 *last_force_resend,
+ struct ceph_connection *con,
bool any_change)
{
struct ceph_pg_pool_info *pi;
struct ceph_pg pgid, last_pgid;
struct ceph_osds up, acting;
bool force_resend = false;
- bool need_check_tiering = false;
- bool need_resend = false;
+ bool unpaused = false;
+ bool legacy_change;
+ bool split = false;
bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
enum calc_target_result ct_res;
int ret;
+ t->epoch = osdc->osdmap->epoch;
pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
if (!pi) {
t->osd = CEPH_HOMELESS_OSD;
@@ -1332,33 +1349,33 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
}
if (osdc->osdmap->epoch == pi->last_force_request_resend) {
- if (last_force_resend &&
- *last_force_resend < pi->last_force_request_resend) {
- *last_force_resend = pi->last_force_request_resend;
+ if (t->last_force_resend < pi->last_force_request_resend) {
+ t->last_force_resend = pi->last_force_request_resend;
force_resend = true;
- } else if (!last_force_resend) {
+ } else if (t->last_force_resend == 0) {
force_resend = true;
}
}
- if (ceph_oid_empty(&t->target_oid) || force_resend) {
- ceph_oid_copy(&t->target_oid, &t->base_oid);
- need_check_tiering = true;
- }
- if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
- ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
- need_check_tiering = true;
- }
- if (need_check_tiering &&
- (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ /* apply tiering */
+ ceph_oid_copy(&t->target_oid, &t->base_oid);
+ ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+ if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
t->target_oloc.pool = pi->read_tier;
if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
t->target_oloc.pool = pi->write_tier;
+
+ pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
+ if (!pi) {
+ t->osd = CEPH_HOMELESS_OSD;
+ ct_res = CALC_TARGET_POOL_DNE;
+ goto out;
+ }
}
- ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
- &t->target_oloc, &pgid);
+ ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc,
+ &pgid);
if (ret) {
WARN_ON(ret != -ENOENT);
t->osd = CEPH_HOMELESS_OSD;
@@ -1368,7 +1385,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
last_pgid.pool = pgid.pool;
last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
- ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+ ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
if (any_change &&
ceph_is_new_interval(&t->acting,
&acting,
@@ -1387,13 +1404,16 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
if (t->paused && !target_should_be_paused(osdc, t, pi)) {
t->paused = false;
- need_resend = true;
+ unpaused = true;
}
+ legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
+ ceph_osds_changed(&t->acting, &acting, any_change);
+ if (t->pg_num)
+ split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
- if (ceph_pg_compare(&t->pgid, &pgid) ||
- ceph_osds_changed(&t->acting, &acting, any_change) ||
- force_resend) {
+ if (legacy_change || force_resend || split) {
t->pgid = pgid; /* struct */
+ ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
ceph_osds_copy(&t->acting, &acting);
ceph_osds_copy(&t->up, &up);
t->size = pi->size;
@@ -1403,15 +1423,342 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
t->sort_bitwise = sort_bitwise;
t->osd = acting.primary;
- need_resend = true;
}
- ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+ if (unpaused || legacy_change || force_resend ||
+ (split && con && CEPH_HAVE_FEATURE(con->peer_features,
+ RESEND_ON_SPLIT)))
+ ct_res = CALC_TARGET_NEED_RESEND;
+ else
+ ct_res = CALC_TARGET_NO_ACTION;
+
out:
dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
return ct_res;
}
+static struct ceph_spg_mapping *alloc_spg_mapping(void)
+{
+ struct ceph_spg_mapping *spg;
+
+ spg = kmalloc(sizeof(*spg), GFP_NOIO);
+ if (!spg)
+ return NULL;
+
+ RB_CLEAR_NODE(&spg->node);
+ spg->backoffs = RB_ROOT;
+ return spg;
+}
+
+static void free_spg_mapping(struct ceph_spg_mapping *spg)
+{
+ WARN_ON(!RB_EMPTY_NODE(&spg->node));
+ WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
+
+ kfree(spg);
+}
+
+/*
+ * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
+ * ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
+ * defined only within a specific spgid; it does not pass anything to
+ * children on split, or to another primary.
+ */
+DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
+ RB_BYPTR, const struct ceph_spg *, node)
+
+static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
+{
+ return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
+}
+
+static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
+ void **pkey, size_t *pkey_len)
+{
+ if (hoid->key_len) {
+ *pkey = hoid->key;
+ *pkey_len = hoid->key_len;
+ } else {
+ *pkey = hoid->oid;
+ *pkey_len = hoid->oid_len;
+ }
+}
+
+static int compare_names(const void *name1, size_t name1_len,
+ const void *name2, size_t name2_len)
+{
+ int ret;
+
+ ret = memcmp(name1, name2, min(name1_len, name2_len));
+ if (!ret) {
+ if (name1_len < name2_len)
+ ret = -1;
+ else if (name1_len > name2_len)
+ ret = 1;
+ }
+ return ret;
+}
+
+static int hoid_compare(const struct ceph_hobject_id *lhs,
+ const struct ceph_hobject_id *rhs)
+{
+ void *effective_key1, *effective_key2;
+ size_t effective_key1_len, effective_key2_len;
+ int ret;
+
+ if (lhs->is_max < rhs->is_max)
+ return -1;
+ if (lhs->is_max > rhs->is_max)
+ return 1;
+
+ if (lhs->pool < rhs->pool)
+ return -1;
+ if (lhs->pool > rhs->pool)
+ return 1;
+
+ if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
+ return -1;
+ if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
+ return 1;
+
+ ret = compare_names(lhs->nspace, lhs->nspace_len,
+ rhs->nspace, rhs->nspace_len);
+ if (ret)
+ return ret;
+
+ hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
+ hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
+ ret = compare_names(effective_key1, effective_key1_len,
+ effective_key2, effective_key2_len);
+ if (ret)
+ return ret;
+
+ ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
+ if (ret)
+ return ret;
+
+ if (lhs->snapid < rhs->snapid)
+ return -1;
+ if (lhs->snapid > rhs->snapid)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
+ * compat stuff here.
+ *
+ * Assumes @hoid is zero-initialized.
+ */
+static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid)
+{
+ u8 struct_v;
+ u32 struct_len;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ if (struct_v < 4) {
+ pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
+ goto e_inval;
+ }
+
+ hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
+ GFP_NOIO);
+ if (IS_ERR(hoid->key)) {
+ ret = PTR_ERR(hoid->key);
+ hoid->key = NULL;
+ return ret;
+ }
+
+ hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
+ GFP_NOIO);
+ if (IS_ERR(hoid->oid)) {
+ ret = PTR_ERR(hoid->oid);
+ hoid->oid = NULL;
+ return ret;
+ }
+
+ ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
+ ceph_decode_32_safe(p, end, hoid->hash, e_inval);
+ ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
+
+ hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
+ GFP_NOIO);
+ if (IS_ERR(hoid->nspace)) {
+ ret = PTR_ERR(hoid->nspace);
+ hoid->nspace = NULL;
+ return ret;
+ }
+
+ ceph_decode_64_safe(p, end, hoid->pool, e_inval);
+
+ ceph_hoid_build_hash_cache(hoid);
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
+{
+ return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
+ 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
+}
+
+static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
+{
+ ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
+ ceph_encode_string(p, end, hoid->key, hoid->key_len);
+ ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
+ ceph_encode_64(p, hoid->snapid);
+ ceph_encode_32(p, hoid->hash);
+ ceph_encode_8(p, hoid->is_max);
+ ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
+ ceph_encode_64(p, hoid->pool);
+}
+
+static void free_hoid(struct ceph_hobject_id *hoid)
+{
+ if (hoid) {
+ kfree(hoid->key);
+ kfree(hoid->oid);
+ kfree(hoid->nspace);
+ kfree(hoid);
+ }
+}
+
+static struct ceph_osd_backoff *alloc_backoff(void)
+{
+ struct ceph_osd_backoff *backoff;
+
+ backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
+ if (!backoff)
+ return NULL;
+
+ RB_CLEAR_NODE(&backoff->spg_node);
+ RB_CLEAR_NODE(&backoff->id_node);
+ return backoff;
+}
+
+static void free_backoff(struct ceph_osd_backoff *backoff)
+{
+ WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
+ WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
+
+ free_hoid(backoff->begin);
+ free_hoid(backoff->end);
+ kfree(backoff);
+}
+
+/*
+ * Within a specific spgid, backoffs are managed by ->begin hoid.
+ */
+DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
+ RB_BYVAL, spg_node);
+
+static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
+ const struct ceph_hobject_id *hoid)
+{
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ struct ceph_osd_backoff *cur =
+ rb_entry(n, struct ceph_osd_backoff, spg_node);
+ int cmp;
+
+ cmp = hoid_compare(hoid, cur->begin);
+ if (cmp < 0) {
+ n = n->rb_left;
+ } else if (cmp > 0) {
+ if (hoid_compare(hoid, cur->end) < 0)
+ return cur;
+
+ n = n->rb_right;
+ } else {
+ return cur;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Each backoff has a unique id within its OSD session.
+ */
+DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
+
+static void clear_backoffs(struct ceph_osd *osd)
+{
+ while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
+ struct ceph_spg_mapping *spg =
+ rb_entry(rb_first(&osd->o_backoff_mappings),
+ struct ceph_spg_mapping, node);
+
+ while (!RB_EMPTY_ROOT(&spg->backoffs)) {
+ struct ceph_osd_backoff *backoff =
+ rb_entry(rb_first(&spg->backoffs),
+ struct ceph_osd_backoff, spg_node);
+
+ erase_backoff(&spg->backoffs, backoff);
+ erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+ free_backoff(backoff);
+ }
+ erase_spg_mapping(&osd->o_backoff_mappings, spg);
+ free_spg_mapping(spg);