diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2020-11-19 16:59:08 +0100 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2020-12-14 23:21:50 +0100 |
commit | cd1a677cad994021b19665ed476aea63f5d54f31 (patch) | |
tree | 07385b55c4b9aa24cbea60018de04959a3cc91d5 /net/ceph | |
parent | 00498b994113a871a556f7ff24a4cf8a00611700 (diff) |
libceph, ceph: implement msgr2.1 protocol (crc and secure modes)
Implement msgr2.1 wire protocol, available since nautilus 14.2.11
and octopus 15.2.5. msgr2.0 wire protocol is not implemented -- it
has several security, integrity and robustness issues and therefore
considered deprecated.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'net/ceph')
-rw-r--r-- | net/ceph/Kconfig | 3 | ||||
-rw-r--r-- | net/ceph/Makefile | 2 | ||||
-rw-r--r-- | net/ceph/auth.c | 309 | ||||
-rw-r--r-- | net/ceph/decode.c | 45 | ||||
-rw-r--r-- | net/ceph/messenger.c | 68 | ||||
-rw-r--r-- | net/ceph/messenger_v2.c | 3443 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 115 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 85 |
8 files changed, 4046 insertions, 24 deletions
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index df02bd8d6c7b..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -15,4 +15,4 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ pagevec.o snapshot.o string_table.o \ - messenger_v1.o + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 4a0f32b32cc6..6b315c8212b1 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -293,6 +293,39 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) +{ + int ret; + + mutex_lock(&ac->mutex); + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(__ceph_auth_get_authorizer); + int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) @@ -369,3 +402,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/decode.c b/net/ceph/decode.c index 6429b6713507..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> +#include <linux/inet.h> + #include <linux/ceph/decode.h> static int @@ -138,3 +140,46 @@ e_inval: return -EINVAL; } EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; +} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4fb3c33a7b03..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -195,8 +195,11 @@ EXPORT_SYMBOL(ceph_pr_addr); void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -513,7 +516,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) con->out_msg = NULL; } - ceph_con_v1_reset_protocol(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); } /* @@ -526,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -547,7 +554,10 @@ void ceph_con_reset_session(struct ceph_connection *con) con->in_seq = 0; con->in_seq_acked = 0; - ceph_con_v1_reset_session(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -600,6 +610,9 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + return ceph_con_v1_opened(con); } @@ -1302,7 +1315,16 @@ int ceph_parse_ips(const char *c, const char *end, } ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. + */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -1410,6 +1432,13 @@ static bool con_sock_closed(struct ceph_connection *con) CASE(PREOPEN); CASE(V1_BANNER); CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: @@ -1494,7 +1523,10 @@ static void ceph_con_workfn(struct work_struct *work) BUG_ON(con->sock); } - ret = ceph_con_v1_try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -1504,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = ceph_con_v1_try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -1538,9 +1573,8 @@ static void con_fault(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); ceph_con_reset_protocol(con); @@ -1596,7 +1630,11 @@ void ceph_messenger_init(struct ceph_messenger *msgr, ceph_addr_set_port(&msgr->inst.addr, 0); } - msgr->inst.addr.type = 0; + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. + */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; /* generate a random non-zero nonce */ do { @@ -1706,7 +1744,10 @@ void ceph_msg_revoke(struct ceph_msg *msg) if (con->out_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); - ceph_con_v1_revoke(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + else + ceph_con_v1_revoke(con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { @@ -1732,7 +1773,10 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) if (con->in_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was recving\n", __func__, con, msg); - ceph_con_v1_revoke_incoming(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } else { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..5e38c847317b --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com> + */ + +#include <linux/ceph/ceph_debug.h> + +#include <crypto/aead.h> +#include <crypto/algapi.h> /* for crypto_memneq() */ +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/sched/mm.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. + */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. + * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). + */ +static void encode_preamble(const struct ceph_frame_desc *desc, void *p) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + void *start = p; + int i; + + memset(p, 0, CEPH_PREAMBLE_LEN); + + ceph_encode_8(&p, desc->fd_tag); + ceph_encode_8(&p, desc->fd_seg_cnt); + for (i = 0; i < desc->fd_seg_cnt; i++) { + ceph_encode_32(&p, desc->fd_lens[i]); + ceph_encode_16(&p, desc->fd_aligns[i]); + } + + put_unaligned_le32(crc32c(0, start, crcp - start), crcp); +} + +static int decode_preamble(void *p, struct ceph_frame_desc *desc) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + u32 crc, expected_crc; + int i; + + crc = crc32c(0, p, crcp - p); + expected_crc = get_unaligned_le32(crcp); + if (crc != expected_crc) { + pr_err("bad preamble crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = ceph_decode_8(&p); + desc->fd_seg_cnt = ceph_decode_8(&p); + if (desc->fd_seg_cnt < 1 || + desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) { + pr_err("bad segment count %d\n", desc->fd_seg_cnt); + return -EINVAL; + } + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = ceph_decode_32(&p); + desc->fd_aligns[i] = ceph_decode_16(&p); + } + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (!desc->fd_ |