summaryrefslogtreecommitdiffstats
path: root/net/ceph
diff options
context:
space:
mode:
authorIlya Dryomov <idryomov@gmail.com>2020-11-19 16:59:08 +0100
committerIlya Dryomov <idryomov@gmail.com>2020-12-14 23:21:50 +0100
commitcd1a677cad994021b19665ed476aea63f5d54f31 (patch)
tree07385b55c4b9aa24cbea60018de04959a3cc91d5 /net/ceph
parent00498b994113a871a556f7ff24a4cf8a00611700 (diff)
libceph, ceph: implement msgr2.1 protocol (crc and secure modes)
Implement msgr2.1 wire protocol, available since nautilus 14.2.11 and octopus 15.2.5. msgr2.0 wire protocol is not implemented -- it has several security, integrity and robustness issues and therefore considered deprecated. Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'net/ceph')
-rw-r--r--net/ceph/Kconfig3
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/auth.c309
-rw-r--r--net/ceph/decode.c45
-rw-r--r--net/ceph/messenger.c68
-rw-r--r--net/ceph/messenger_v2.c3443
-rw-r--r--net/ceph/mon_client.c115
-rw-r--r--net/ceph/osd_client.c85
8 files changed, 4046 insertions, 24 deletions
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index f36f9a3a4e20..c5c4eef3a9ff 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -5,6 +5,9 @@ config CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
select CRYPTO_CBC
+ select CRYPTO_GCM
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
select CRYPTO
select KEYS
default n
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index df02bd8d6c7b..8802a0c0155d 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -15,4 +15,4 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
auth_x.o \
ceph_strings.o ceph_hash.o \
pagevec.o snapshot.o string_table.o \
- messenger_v1.o
+ messenger_v1.o messenger_v2.o
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 4a0f32b32cc6..6b315c8212b1 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -293,6 +293,39 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
}
EXPORT_SYMBOL(ceph_auth_is_authenticated);
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer)
+ ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ else if (ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, auth);
+ else
+ ret = 0;
+ if (ret)
+ goto out;
+
+ *proto = ac->protocol;
+ if (pref_mode && fallb_mode) {
+ *pref_mode = ac->preferred_mode;
+ *fallb_mode = ac->fallback_mode;
+ }
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(__ceph_auth_get_authorizer);
+
int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
int peer_type,
struct ceph_auth_handshake *auth)
@@ -369,3 +402,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
mutex_unlock(&ac->mutex);
}
EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
+
+/*
+ * msgr2 authentication
+ */
+
+static bool contains(const int *arr, int cnt, int val)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ if (arr[i] == val)
+ return true;
+ }
+
+ return false;
+}
+
+static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode)
+{
+ WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN);
+ if (fallb_mode != CEPH_CON_MODE_UNKNOWN) {
+ ceph_encode_32_safe(p, end, 2, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ ceph_encode_32_safe(p, end, fallb_mode, e_range);
+ } else {
+ ceph_encode_32_safe(p, end, 1, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ }
+
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+/*
+ * Similar to ceph_auth_build_hello().
+ */
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len)
+{
+ int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE;
+ void *end = buf + buf_len;
+ void *lenp;
+ void *p;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (ac->protocol == CEPH_AUTH_UNKNOWN) {
+ ret = init_protocol(ac, proto);
+ if (ret) {
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(proto), ret);
+ goto out;
+ }
+ } else {
+ WARN_ON(ac->protocol != proto);
+ ac->ops->reset(ac);
+ }
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode);
+ if (ret)
+ goto out;
+
+ lenp = p;
+ p += 4; /* space for len */
+
+ ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret)
+ goto out;
+
+ ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+ ceph_encode_32(&lenp, p - lenp - 4);
+ ret = p - buf;
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+e_range:
+ ret = -ERANGE;
+ goto out;
+}
+
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, false, buf, buf_len);
+ else
+ WARN_ON(ret >= 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (global_id && ac->global_id != global_id) {
+ dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
+ global_id);
+ ac->global_id = global_id;
+ }
+
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed\n",
+ ceph_auth_proto_name(ac->protocol));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->preferred_mode));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->fallback_mode));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' msgr authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ int pref_mode, fallb_mode;
+ int proto;
+ void *p;
+ int ret;
+
+ ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto,
+ &pref_mode, &fallb_mode);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, proto, e_range);
+ ret = encode_con_modes(&p, end, pref_mode, fallb_mode);
+ if (ret)
+ return ret;
+
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_get_authorizer);
+
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ void *p;
+ int ret;
+
+ ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
+ reply, reply_len);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more);
+
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done);
+
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed by %s\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->preferred_mode),
+ ceph_entity_type_name(peer_type));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->fallback_mode),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' authorization to %s failed: %d\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type), result);
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer);
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index 6429b6713507..b44f7651be04 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
+#include <linux/inet.h>
+
#include <linux/ceph/decode.h>
static int
@@ -138,3 +140,46 @@ e_inval:
return -EINVAL;
}
EXPORT_SYMBOL(ceph_decode_entity_addrvec);
+
+static int get_sockaddr_encoding_len(sa_family_t family)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } u;
+
+ switch (family) {
+ case AF_INET:
+ return sizeof(u.sin);
+ case AF_INET6:
+ return sizeof(u.sin6);
+ default:
+ return sizeof(u);
+ }
+}
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len;
+}
+
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ ceph_encode_8(p, 1); /* marker */
+ ceph_start_encoding(p, 1, 1, sizeof(addr->type) +
+ sizeof(addr->nonce) +
+ sizeof(u32) + addr_len);
+ ceph_encode_copy(p, &addr->type, sizeof(addr->type));
+ ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce));
+
+ ceph_encode_32(p, addr_len);
+ ceph_encode_16(p, family);
+ ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family));
+}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4fb3c33a7b03..57d043b382ed 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -195,8 +195,11 @@ EXPORT_SYMBOL(ceph_pr_addr);
void ceph_encode_my_addr(struct ceph_messenger *msgr)
{
- memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
- ceph_encode_banner_addr(&msgr->my_enc_addr);
+ if (!ceph_msgr2(from_msgr(msgr))) {
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
+ sizeof(msgr->my_enc_addr));
+ ceph_encode_banner_addr(&msgr->my_enc_addr);
+ }
}
/*
@@ -513,7 +516,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con)
con->out_msg = NULL;
}
- ceph_con_v1_reset_protocol(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_protocol(con);
+ else
+ ceph_con_v1_reset_protocol(con);
}
/*
@@ -526,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
ceph_msg_put(msg);
}
+
static void ceph_msg_remove_list(struct list_head *head)
{
while (!list_empty(head)) {
@@ -547,7 +554,10 @@ void ceph_con_reset_session(struct ceph_connection *con)
con->in_seq = 0;
con->in_seq_acked = 0;
- ceph_con_v1_reset_session(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_session(con);
+ else
+ ceph_con_v1_reset_session(con);
}
/*
@@ -600,6 +610,9 @@ EXPORT_SYMBOL(ceph_con_open);
*/
bool ceph_con_opened(struct ceph_connection *con)
{
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ return ceph_con_v2_opened(con);
+
return ceph_con_v1_opened(con);
}
@@ -1302,7 +1315,16 @@ int ceph_parse_ips(const char *c, const char *end,
}
ceph_addr_set_port(&addr[i], port);
+ /*
+ * We want the type to be set according to ms_mode
+ * option, but options are normally parsed after mon
+ * addresses. Rather than complicating parsing, set
+ * to LEGACY and override in build_initial_monmap()
+ * for mon addresses and ceph_messenger_init() for
+ * ip option.
+ */
addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ addr[i].nonce = 0;
dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
@@ -1410,6 +1432,13 @@ static bool con_sock_closed(struct ceph_connection *con)
CASE(PREOPEN);
CASE(V1_BANNER);
CASE(V1_CONNECT_MSG);
+ CASE(V2_BANNER_PREFIX);
+ CASE(V2_BANNER_PAYLOAD);
+ CASE(V2_HELLO);
+ CASE(V2_AUTH);
+ CASE(V2_AUTH_SIGNATURE);
+ CASE(V2_SESSION_CONNECT);
+ CASE(V2_SESSION_RECONNECT);
CASE(OPEN);
CASE(STANDBY);
default:
@@ -1494,7 +1523,10 @@ static void ceph_con_workfn(struct work_struct *work)
BUG_ON(con->sock);
}
- ret = ceph_con_v1_try_read(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_read(con);
+ else
+ ret = ceph_con_v1_try_read(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -1504,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work)
break;
}
- ret = ceph_con_v1_try_write(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_write(con);
+ else
+ ret = ceph_con_v1_try_write(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -1538,9 +1573,8 @@ static void con_fault(struct ceph_connection *con)
ceph_pr_addr(&con->peer_addr), con->error_msg);
con->error_msg = NULL;
- WARN_ON(con->state != CEPH_CON_S_V1_BANNER &&
- con->state != CEPH_CON_S_V1_CONNECT_MSG &&
- con->state != CEPH_CON_S_OPEN);
+ WARN_ON(con->state == CEPH_CON_S_STANDBY ||
+ con->state == CEPH_CON_S_CLOSED);
ceph_con_reset_protocol(con);
@@ -1596,7 +1630,11 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
ceph_addr_set_port(&msgr->inst.addr, 0);
}
- msgr->inst.addr.type = 0;
+ /*
+ * Since nautilus, clients are identified using type ANY.
+ * For msgr1, ceph_encode_banner_addr() munges it to NONE.
+ */
+ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
/* generate a random non-zero nonce */
do {
@@ -1706,7 +1744,10 @@ void ceph_msg_revoke(struct ceph_msg *msg)
if (con->out_msg == msg) {
WARN_ON(con->state != CEPH_CON_S_OPEN);
dout("%s con %p msg %p was sending\n", __func__, con, msg);
- ceph_con_v1_revoke(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke(con);
+ else
+ ceph_con_v1_revoke(con);
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
} else {
@@ -1732,7 +1773,10 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
if (con->in_msg == msg) {
WARN_ON(con->state != CEPH_CON_S_OPEN);
dout("%s con %p msg %p was recving\n", __func__, con, msg);
- ceph_con_v1_revoke_incoming(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke_incoming(con);
+ else
+ ceph_con_v1_revoke_incoming(con);
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
} else {
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
new file mode 100644
index 000000000000..5e38c847317b
--- /dev/null
+++ b/net/ceph/messenger_v2.c
@@ -0,0 +1,3443 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph msgr2 protocol implementation
+ *
+ * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <crypto/aead.h>
+#include <crypto/algapi.h> /* for crypto_memneq() */
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/sched/mm.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
+
+#define FRAME_TAG_HELLO 1
+#define FRAME_TAG_AUTH_REQUEST 2
+#define FRAME_TAG_AUTH_BAD_METHOD 3
+#define FRAME_TAG_AUTH_REPLY_MORE 4
+#define FRAME_TAG_AUTH_REQUEST_MORE 5
+#define FRAME_TAG_AUTH_DONE 6
+#define FRAME_TAG_AUTH_SIGNATURE 7
+#define FRAME_TAG_CLIENT_IDENT 8
+#define FRAME_TAG_SERVER_IDENT 9
+#define FRAME_TAG_IDENT_MISSING_FEATURES 10
+#define FRAME_TAG_SESSION_RECONNECT 11
+#define FRAME_TAG_SESSION_RESET 12
+#define FRAME_TAG_SESSION_RETRY 13
+#define FRAME_TAG_SESSION_RETRY_GLOBAL 14
+#define FRAME_TAG_SESSION_RECONNECT_OK 15
+#define FRAME_TAG_WAIT 16
+#define FRAME_TAG_MESSAGE 17
+#define FRAME_TAG_KEEPALIVE2 18
+#define FRAME_TAG_KEEPALIVE2_ACK 19
+#define FRAME_TAG_ACK 20
+
+#define FRAME_LATE_STATUS_ABORTED 0x1
+#define FRAME_LATE_STATUS_COMPLETE 0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
+
+#define IN_S_HANDLE_PREAMBLE 1
+#define IN_S_HANDLE_CONTROL 2
+#define IN_S_HANDLE_CONTROL_REMAINDER 3
+#define IN_S_PREPARE_READ_DATA 4
+#define IN_S_PREPARE_READ_DATA_CONT 5
+#define IN_S_HANDLE_EPILOGUE 6
+#define IN_S_FINISH_SKIP 7
+
+#define OUT_S_QUEUE_DATA 1
+#define OUT_S_QUEUE_DATA_CONT 2
+#define OUT_S_QUEUE_ENC_PAGE 3
+#define OUT_S_QUEUE_ZEROS 4
+#define OUT_S_FINISH_MESSAGE 5
+#define OUT_S_GET_NEXT 6
+
+#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN)
+#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
+#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
+#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
+
+#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int do_recvmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+/*
+ * Read as much as possible.
+ *
+ * Return:
+ * 1 - done, nothing (else) to read
+ * 0 - socket is empty, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_recv(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p %s %zu\n", __func__, con,
+ iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need",
+ iov_iter_count(&con->v2.in_iter));
+ ret = do_recvmsg(con->sock, &con->v2.in_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.in_iter));
+ return ret;
+}
+
+static int do_sendmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ struct bio_vec bv;
+ int ret;
+
+ if (WARN_ON(!iov_iter_is_bvec(it)))
+ return -EINVAL;
+
+ while (iov_iter_count(it)) {
+ /* iov_iter_iovec() for ITER_BVEC */
+ bv.bv_page = it->bvec->bv_page;
+ bv.bv_offset = it->bvec->bv_offset + it->iov_offset;
+ bv.bv_len = min(iov_iter_count(it),
+ it->bvec->bv_len - it->iov_offset);
+
+ /*
+ * sendpage cannot properly handle pages with
+ * page_count == 0, we need to fall back to sendmsg if
+ * that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag
+ * which triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(bv.bv_page)) {
+ ret = sock->ops->sendpage(sock, bv.bv_page,
+ bv.bv_offset, bv.bv_len,
+ CEPH_MSG_FLAGS);
+ } else {
+ iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len);
+ ret = sock_sendmsg(sock, &msg);
+ }
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ return 1;
+}
+
+/*
+ * Write as much as possible. The socket is expected to be corked,
+ * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ *
+ * Return:
+ * 1 - done, nothing (else) to write
+ * 0 - socket is full, need to wait
+ * <0 - error
+ */
+static int ceph_tcp_send(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
+ if (con->v2.out_iter_sendpage)
+ ret = do_try_sendpage(con->sock, &con->v2.out_iter);
+ else
+ ret = do_sendmsg(con->sock, &con->v2.out_iter);
+ dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+ iov_iter_count(&con->v2.out_iter));
+ return ret;
+}
+
+static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf;
+ con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len;
+ con->v2.in_kvec_cnt++;
+
+ con->v2.in_iter.nr_segs++;
+ con->v2.in_iter.count += len;
+}
+
+static void reset_in_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_kvec_cnt = 0;
+ iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0);
+}
+
+static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ con->v2.in_bvec = *bv;
+ iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len);
+}
+
+static void set_in_skip(struct ceph_connection *con, int len)
+{
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ dout("%s con %p len %d\n", __func__, con, len);
+ iov_iter_discard(&con->v2.in_iter, READ, len);
+}
+
+static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf;
+ con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len;
+ con->v2.out_kvec_cnt++;
+
+ con->v2.out_iter.nr_segs++;
+ con->v2.out_iter.count += len;
+}
+
+static void reset_out_kvecs(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_kvec_cnt = 0;
+
+ iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0);
+ con->v2.out_iter_sendpage = false;
+}
+
+static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
+ bool zerocopy)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(con->v2.out_zero);
+
+ con->v2.out_bvec = *bv;
+ con->v2.out_iter_sendpage = zerocopy;
+ iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void set_out_bvec_zero(struct ceph_connection *con)
+{
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ WARN_ON(!con->v2.out_zero);
+
+ con->v2.out_bvec.bv_page = ceph_zero_page;
+ con->v2.out_bvec.bv_offset = 0;
+ con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE);
+ con->v2.out_iter_sendpage = true;
+ iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+ con->v2.out_bvec.bv_len);
+}
+
+static void out_zero_add(struct ceph_connection *con, int len)
+{
+ dout("%s con %p len %d\n", __func__, con, len);
+ con->v2.out_zero += len;
+}
+
+static void *alloc_conn_buf(struct ceph_connection *con, int len)
+{
+ void *buf;
+
+ dout("%s con %p len %d\n", __func__, con, len);
+
+ if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
+ return NULL;
+
+ buf = ceph_kvmalloc(len, GFP_NOIO);
+ if (!buf)
+ return NULL;
+
+ con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
+ return buf;
+}
+
+static void free_conn_bufs(struct ceph_connection *con)
+{
+ while (con->v2.conn_buf_cnt)
+ kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]);
+}
+
+static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
+
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf;
+ con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len;
+ con->v2.in_sign_kvec_cnt++;
+}
+
+static void clear_in_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.in_sign_kvec_cnt = 0;
+}
+
+static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+ BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
+
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf;
+ con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len;
+ con->v2.out_sign_kvec_cnt++;
+}
+
+static void clear_out_sign_kvecs(struct ceph_connection *con)
+{
+ con->v2.out_sign_kvec_cnt = 0;
+}
+
+static bool con_secure(struct ceph_connection *con)
+{
+ return con->v2.con_mode == CEPH_CON_MODE_SECURE;
+}
+
+static int front_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.front_len);
+}
+
+static int middle_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.middle_len);
+}
+
+static int data_len(const struct ceph_msg *msg)
+{
+ return le32_to_cpu(msg->hdr.data_len);
+}
+
+static bool need_padding(int len)
+{
+ return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padded_len(int len)
+{
+ return ALIGN(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padding_len(int len)
+{
+ return padded_len(len) - len;
+}
+
+/* preamble + control segment */
+static int head_onwire_len(int ctrl_len, bool secure)
+{
+ int head_len;
+ int rem_len;
+
+ if (secure) {
+ head_len = CEPH_PREAMBLE_SECURE_LEN;
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN;
+ }
+ } else {
+ head_len = CEPH_PREAMBLE_PLAIN_LEN;
+ if (ctrl_len)
+ head_len += ctrl_len + CEPH_CRC_LEN;
+ }
+ return head_len;
+}
+
+/* front, middle and data segments + epilogue */
+static int __tail_onwire_len(int front_len, int middle_len, int data_len,
+ bool secure)
+{
+ if (!front_len && !middle_len && !data_len)
+ return 0;
+
+ if (!secure)
+ return front_len + middle_len + data_len +
+ CEPH_EPILOGUE_PLAIN_LEN;
+
+ return padded_len(front_len) + padded_len(middle_len) +
+ padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
+}
+
+static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
+{
+ return __tail_onwire_len(front_len(msg), middle_len(msg),
+ data_len(msg), secure);
+}
+
+/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */
+#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \
+ sizeof(struct ceph_msg_header2) + \
+ CEPH_CRC_LEN)
+
+static const int frame_aligns[] = {
+ sizeof(void *),
+ sizeof(void *),
+ sizeof(void *),
+ PAGE_SIZE
+};
+
+/*
+ * Discards trailing empty segments, unless there is just one segment.
+ * A frame always has at least one (possibly empty) segment.
+ */
+static int calc_segment_count(const int *lens, int len_cnt)
+{
+ int i;
+
+ for (i = len_cnt - 1; i >= 0; i--) {
+ if (lens[i])
+ return i + 1;
+ }
+
+ return 1;
+}
+
+static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
+ const int *lens, int len_cnt)
+{
+ int i;
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = tag;
+ desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
+ BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = lens[i];
+ desc->fd_aligns[i] = frame_aligns[i];
+ }
+}
+
+/*
+ * Preamble crc covers everything up to itself (28 bytes) and
+ * is calculated and verified irrespective of the connection mode
+ * (i.e. even if the frame is encrypted).
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ void *start = p;
+ int i;
+
+ memset(p, 0, CEPH_PREAMBLE_LEN);
+
+ ceph_encode_8(&p, desc->fd_tag);
+ ceph_encode_8(&p, desc->fd_seg_cnt);
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ ceph_encode_32(&p, desc->fd_lens[i]);
+ ceph_encode_16(&p, desc->fd_aligns[i]);
+ }
+
+ put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
+}
+
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ u32 crc, expected_crc;
+ int i;
+
+ crc = crc32c(0, p, crcp - p);
+ expected_crc = get_unaligned_le32(crcp);
+ if (crc != expected_crc) {
+ pr_err("bad preamble crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = ceph_decode_8(&p);
+ desc->fd_seg_cnt = ceph_decode_8(&p);
+ if (desc->fd_seg_cnt < 1 ||
+ desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+ pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = ceph_decode_32(&p);
+ desc->fd_aligns[i] = ceph_decode_16(&p);
+ }
+
+ /*
+ * This would fire for FRAME_TAG_WAIT (it has one empty
+ * segment), but we should never get it as client.
+ */
+ if (!desc->fd_