summary refs log tree commit diff stats
path: root/arch/arm64
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2019-11-25 19:49:58 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2019-11-25 19:49:58 -0800
commit 642356cb5f4a8c82b5ca5ebac288c327d10df236 (patch)
tree 85bdf911a1307d33838449cb8209b828dcfef1c7 /arch/arm64
parent f838767555d40f29bc4771c5c8cc63193094b7cc (diff)
parent 4ee812f6143d78d8ba1399671d78c8d78bf2817c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu: "API: - Add library interfaces of certain crypto algorithms for WireGuard - Remove the obsolete ablkcipher and blkcipher interfaces - Move add_early_randomness() out of rng_mutex Algorithms: - Add blake2b shash algorithm - Add blake2s shash algorithm - Add curve25519 kpp algorithm - Implement 4-way interleave in arm64/gcm-ce - Implement ciphertext stealing in powerpc/spe-xts - Add Eric Biggers's scalar accelerated ChaCha code for ARM - Add accelerated 32r2 code from Zinc for MIPS - Add OpenSSL/CRYPTOGAMS poly1305 implementation for ARM and MIPS Drivers: - Fix entropy reading failures in ks-sa - Add support for sam9x60 in atmel - Add crypto accelerator for amlogic GXL - Add sun8i-ce Crypto Engine - Add sun8i-ss cryptographic offloader - Add a host of algorithms to inside-secure - Add NPCM RNG driver - Add HiSilicon HPRE accelerator - Add HiSilicon TRNG driver" * git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (285 commits) crypto: vmx - Avoid weird build failures crypto: lib/chacha20poly1305 - use chacha20_crypt() crypto: x86/chacha - only unregister algorithms if registered crypto: chacha_generic - remove unnecessary setkey() functions crypto: amlogic - enable working on big endian kernel crypto: sun8i-ce - enable working on big endian crypto: mips/chacha - select CRYPTO_SKCIPHER, not CRYPTO_BLKCIPHER hwrng: ks-sa - Enable COMPILE_TEST crypto: essiv - remove redundant null pointer check before kfree crypto: atmel-aes - Change data type for "lastc" buffer crypto: atmel-tdes - Set the IV after {en,de}crypt crypto: sun4i-ss - fix big endian issues crypto: sun4i-ss - hide the Invalid keylen message crypto: sun4i-ss - use crypto_ahash_digestsize crypto: sun4i-ss - remove dependency on not 64BIT crypto: sun4i-ss - Fix 64-bit size_t warnings on sun4i-ss-hash.c MAINTAINERS: Add maintainer for HiSilicon SEC V2 driver crypto: hisilicon - add DebugFS for HiSilicon SEC Documentation: add DebugFS doc for HiSilicon SEC 
crypto: hisilicon - add SRIOV for HiSilicon SEC ...
Diffstat (limited to 'arch/arm64')
-rw-r--r-- arch/arm64/Kconfig 2
-rw-r--r-- arch/arm64/crypto/Kconfig 17
-rw-r--r-- arch/arm64/crypto/Makefile 10
-rw-r--r-- arch/arm64/crypto/aes-neonbs-glue.c 2
-rw-r--r-- arch/arm64/crypto/chacha-neon-glue.c 81
-rw-r--r-- arch/arm64/crypto/ghash-ce-core.S 501
-rw-r--r-- arch/arm64/crypto/ghash-ce-glue.c 293
-rw-r--r-- arch/arm64/crypto/poly1305-armv8.pl 913
-rw-r--r-- arch/arm64/crypto/poly1305-core.S_shipped 835
-rw-r--r-- arch/arm64/crypto/poly1305-glue.c 237
10 files changed, 2533 insertions, 358 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d66a9727344d..fcc6635666b4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -67,7 +67,7 @@ config ARM64
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_SUPPORTS_MEMORY_FAILURE
select ARCH_SUPPORTS_ATOMIC_RMW
- select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
+ select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 4922c4451e7c..b8eb0453123d 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -86,7 +86,7 @@ config CRYPTO_AES_ARM64_CE_CCM
config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
select CRYPTO_SIMD
@@ -94,7 +94,7 @@ config CRYPTO_AES_ARM64_CE_BLK
config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
select CRYPTO_SIMD
@@ -102,8 +102,15 @@ config CRYPTO_AES_ARM64_NEON_BLK
config CRYPTO_CHACHA20_NEON
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_NEON
+ tristate "Poly1305 hash function using scalar or NEON instructions"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
@@ -113,7 +120,7 @@ config CRYPTO_NHPOLY1305_NEON
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 0435f2a0610e..d0901e610df3 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
+
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
+ $(call cmd,perlasm)
+
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
+
endif
-clean-files += sha256-core.S sha512-core.S
+clean-files += poly1305-core.S sha256-core.S sha512-core.S
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index ea873b8904c4..e3e27349a9fe 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -384,7 +384,7 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
goto xts_tail;
kernel_neon_end();
- skcipher_walk_done(&walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
if (err || likely(!tail))
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 1495d2b18518..b08029d7bde6 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -1,5 +1,5 @@
/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
@@ -20,9 +20,10 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -36,6 +37,8 @@ asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
@@ -59,6 +62,37 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
}
}
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, bytes, nrounds);
+ kernel_neon_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
@@ -68,7 +102,7 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
err = skcipher_walk_virt(&walk, req, false);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -76,10 +110,17 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
- kernel_neon_begin();
- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
- kernel_neon_end();
+ if (!static_branch_likely(&have_neon) ||
+ !crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -91,9 +132,6 @@ static int chacha_neon(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
return chacha_neon_stream_xor(req, ctx, req->iv);
}
@@ -105,14 +143,8 @@ static int xchacha_neon(struct skcipher_request *req)
u32 state[16];
u8 real_iv[16];
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- crypto_chacha_init(state, ctx, req->iv);
-
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
+ chacha_init_generic(state, ctx->key, req->iv);
+ hchacha_block_arch(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@@ -134,7 +166,7 @@ static struct skcipher_alg algs[] = {
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
@@ -150,7 +182,7 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
@@ -166,7 +198,7 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
@@ -175,14 +207,17 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
- return -ENODEV;
+ return 0;
+
+ static_branch_enable(&have_neon);
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (cpu_have_named_feature(ASIMD))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 410e8afcf5a7..a791c4adf8e6 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -13,8 +13,8 @@
T1 .req v2
T2 .req v3
MASK .req v4
- XL .req v5
- XM .req v6
+ XM .req v5
+ XL .req v6
XH .req v7
IN1 .req v7
@@ -358,20 +358,37 @@ ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
ENDPROC(pmull_ghash_update_p8)
- KS0 .req v12
- KS1 .req v13
- INP0 .req v14
- INP1 .req v15
-
- .macro load_round_keys, rounds, rk
- cmp \rounds, #12
- blo 2222f /* 128 bits */
- beq 1111f /* 192 bits */
- ld1 {v17.4s-v18.4s}, [\rk], #32
-1111: ld1 {v19.4s-v20.4s}, [\rk], #32
-2222: ld1 {v21.4s-v24.4s}, [\rk], #64
- ld1 {v25.4s-v28.4s}, [\rk], #64
- ld1 {v29.4s-v31.4s}, [\rk]
+ KS0 .req v8
+ KS1 .req v9
+ KS2 .req v10
+ KS3 .req v11
+
+ INP0 .req v21
+ INP1 .req v22
+ INP2 .req v23
+ INP3 .req v24
+
+ K0 .req v25
+ K1 .req v26
+ K2 .req v27
+ K3 .req v28
+ K4 .req v12
+ K5 .req v13
+ K6 .req v4
+ K7 .req v5
+ K8 .req v14
+ K9 .req v15
+ KK .req v29
+ KL .req v30
+ KM .req v31
+
+ .macro load_round_keys, rounds, rk, tmp
+ add \tmp, \rk, #64
+ ld1 {K0.4s-K3.4s}, [\rk]
+ ld1 {K4.4s-K5.4s}, [\tmp]
+ add \tmp, \rk, \rounds, lsl #4
+ sub \tmp, \tmp, #32
+ ld1 {KK.4s-KM.4s}, [\tmp]
.endm
.macro enc_round, state, key
@@ -379,197 +396,367 @@ ENDPROC(pmull_ghash_update_p8)
aesmc \state\().16b, \state\().16b
.endm
- .macro enc_block, state, rounds
- cmp \rounds, #12
- b.lo 2222f /* 128 bits */
- b.eq 1111f /* 192 bits */
- enc_round \state, v17
- enc_round \state, v18
-1111: enc_round \state, v19
- enc_round \state, v20
-2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ .macro enc_qround, s0, s1, s2, s3, key
+ enc_round \s0, \key
+ enc_round \s1, \key
+ enc_round \s2, \key
+ enc_round \s3, \key
+ .endm
+
+ .macro enc_block, state, rounds, rk, tmp
+ add \tmp, \rk, #96
+ ld1 {K6.4s-K7.4s}, [\tmp], #32
+ .irp key, K0, K1, K2, K3, K4 K5
enc_round \state, \key
.endr
- aese \state\().16b, v30.16b
- eor \state\().16b, \state\().16b, v31.16b
+
+ tbnz \rounds, #2, .Lnot128_\@
+.Lout256_\@:
+ enc_round \state, K6
+ enc_round \state, K7
+
+.Lout192_\@:
+ enc_round \state, KK
+ aese \state\().16b, KL.16b
+ eor \state\().16b, \state\().16b, KM.16b
+
+ .subsection 1
+.Lnot128_\@:
+ ld1 {K8.4s-K9.4s}, [\tmp], #32
+ enc_round \state, K6
+ enc_round \state, K7
+ ld1 {K6.4s-K7.4s}, [\tmp]
+ enc_round \state, K8
+ enc_round \state, K9
+ tbz \rounds, #1, .Lout192_\@
+ b .Lout256_\@
+ .previous
.endm
+ .align 6
.macro pmull_gcm_do_crypt, enc
- ld1 {SHASH.2d}, [x4], #16
- ld1 {HH.2d}, [x4]
- ld1 {XL.2d}, [x1]
- ldr x8, [x5, #8] // load lower counter
+ stp x29, x30, [sp, #-32]!
+ mov x29, sp
+ str x19, [sp, #24]
+
+ load_round_keys x7, x6, x8
+
+ ld1 {SHASH.2d}, [x3], #16
+ ld1 {HH.2d-HH4.2d}, [x3]
- movi MASK.16b, #0xe1
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d, SHASH.2d, HH.2d
-CPU_LE( rev x8, x8 )
- shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, T1.16b
- .if \enc == 1
- ldr x10, [sp]
- ld1 {KS0.16b-KS1.16b}, [x10]
- .endif
+ trn1 HH34.2d, HH3.2d, HH4.2d
+ trn2 T1.2d, HH3.2d, HH4.2d
+ eor HH34.16b, HH34.16b, T1.16b
- cbnz x6, 4f
+ ld1 {XL.2d}, [x4]
-0: ld1 {INP0.16b-INP1.16b}, [x3], #32
+ cbz x0, 3f // tag only?
- rev x9, x8
- add x11, x8, #1
- add x8, x8, #2
+ ldr w8, [x5, #12] // load lower counter
+CPU_LE( rev w8, w8 )
- .if \enc == 1
- eor INP0.16b, INP0.16b, KS0.16b // encrypt input
- eor INP1.16b, INP1.16b, KS1.16b
+0: mov w9, #4 // max blocks per round
+ add x10, x0, #0xf
+ lsr x10, x10, #4 // remaining blocks
+
+ subs x0, x0, #64
+ csel w9, w10, w9, mi
+ add w8, w8, w9
+
+ bmi 1f
+ ld1 {INP0.16b-INP3.16b}, [x2], #64
+ .subsection 1
+ /*
+ * Populate the four input registers right to left with up to 63 bytes
+ * of data, using overlapping loads to avoid branches.
+ *
+ * INP0 INP1 INP2 INP3
+ * 1 byte | | | |x |
+ * 16 bytes | | | |xxxxxxxx|
+ * 17 bytes | | |xxxxxxxx|x |
+ * 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx |
+ * etc etc
+ *
+ * Note that this code may read up to 15 bytes before the start of
+ * the input. It is up to the calling code to ensure this is safe if
+ * this happens in the first iteration of the loop (i.e., when the
+ * input size is < 16 bytes)
+ */
+1: mov x15, #16
+ ands x19, x0, #0xf
+ csel x19, x19, x15, ne
+ adr_l x17, .Lpermute_table + 16
+
+ sub x11, x15, x19
+ add x12, x17, x11
+ sub x17, x17, x11
+ ld1 {T1.16b}, [x12]
+ sub x10, x1, x11
+ sub x11, x2, x11
+
+ cmp x0, #-16
+ csel x14, x15, xzr, gt
+ cmp x0, #-32
+ csel x15, x15, xzr, gt
+ cmp x0, #-48
+ csel x16, x19, xzr, gt
+ csel x1, x1, x10, gt
+ csel x2, x2, x11, gt
+
+ ld1 {INP0.16b}, [x2], x14
+ ld1 {INP1.16b}, [x2], x15
+ ld1 {INP2.16b}, [x2], x16
+ ld1 {INP3.16b}, [x2]
+ tbl INP3.16b, {INP3.16b}, T1.16b
+ b 2f
+ .previous
+
+2: .if \enc == 0
+ bl pmull_gcm_ghash_4x
.endif
- ld1 {KS0.8b}, [x5] // load upper counter
- rev x11, x11
- sub w0, w0, #2
- mov KS1.8b, KS0.8b
- ins KS0.d[1], x9 // set lower counter
- ins KS1.d[1], x11
+ bl pmull_gcm_enc_4x
- rev64 T1.16b, INP1.16b
+ tbnz x0, #63, 6f
+ st1 {INP0.16b-INP3.16b}, [x1], #64
+ .if \enc == 1
+ bl pmull_gcm_ghash_4x
+ .endif
+ bne 0b
- cmp w7, #12
- b.ge 2f // AES-192/256?
+3: ldp x19, x10, [sp, #24]
+ cbz x10, 5f // output tag?
-1: enc_round KS0, v21
- ext IN1.16b, T1.16b, T1.16b, #8
+ ld1 {INP3.16b}, [x10] // load lengths[]
+ mov w9, #1
+ bl pmull_gcm_ghash_4x
- enc_round KS1, v21
- pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
+ mov w11, #(0x1 << 24) // BE '1U'
+ ld1 {KS0.16b}, [x5]
+ mov KS0.s[3], w11
- enc_round KS0, v22
- eor T1.16b, T1.16b, IN1.16b
+ enc_block KS0, x7, x6, x12
- enc_round KS1, v22
- pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
+ ext XL.16b, XL.16b, XL.16b, #8
+ rev64 XL.16b, XL.16b
+ eor XL.16b, XL.16b, KS0.16b
+ st1 {XL.16b}, [x10] // store tag
- enc_round KS0, v23
- pmull XM2.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+4: ldp x29, x30, [sp], #32
+ ret
- enc_round KS1, v23
- rev64 T1.16b, INP0.16b
- ext T2.16b, XL.16b, XL.16b, #8
+5:
+CPU_LE( rev w8, w8 )
+ str w8, [x5, #12] // store lower counter
+ st1 {XL.2d}, [x4]
+ b 4b
+
+6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
+ sub x17, x17, x19, lsl #1
+
+ cmp w9, #1
+ beq 7f
+ .subsection 1
+7: ld1 {INP2.16b}, [x1]
+ tbx INP2.16b, {INP3.16b}, T1.16b
+ mov INP3.16b, INP2.16b
+ b 8f
+ .previous
+
+ st1 {INP0.16b}, [x1], x14
+ st1 {INP1.16b}, [x1], x15
+ st1 {INP2.16b}, [x1], x16
+ tbl INP3.16b, {INP3.16b}, T1.16b
+ tbx INP3.16b, {INP2.16b}, T2.16b
+8: st1 {INP3.16b}, [x1]
- enc_round KS0, v24
- ext IN1.16b, T1.16b, T1.16b, #8
- eor T1.16b, T1.16b, T2.16b
+ .if \enc == 1
+ ld1 {T1.16b}, [x17]
+ tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
+ bl pmull_gcm_ghash_4x
+ .endif
+ b 3b
+ .endm
- enc_round KS1, v24
- eor XL.16b, XL.16b, IN1.16b
+ /*
+ * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u64 dg[], u8 ctr[],
+ * u32 const rk[], int rounds, u8 tag[])
+ */
+ENTRY(pmull_gcm_encrypt)
+ pmull_gcm_do_crypt 1
+ENDPROC(pmull_gcm_encrypt)
- enc_round KS0, v25
- eor T1.16b, T1.16b, XL.16b
+ /*
+ * void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
+ * struct ghash_key const *k, u64 dg[], u8 ctr[],
+ * u32 const rk[], int rounds, u8 tag[])
+ */
+ENTRY(pmull_gcm_decrypt)
+ pmull_gcm_do_crypt 0
+ENDPROC(pmull_gcm_decrypt)
- enc_round KS1, v25
- pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
+pmull_gcm_ghash_4x:
+ movi MASK.16b, #0xe1
+ shl MASK.2d, MASK.2d, #57
- enc_round KS0, v26
- pmull XL.1q, HH.1d, XL.1d // a0 * b0
+ rev64 T1.16b, INP0.16b
+ rev64 T2.16b, INP1.16b
+ rev64 TT3.16b, INP2.16b
+ rev64 TT4.16b, INP3.16b
- enc_round KS1, v26
- pmull2 XM.1q, SHASH2.2d, T1.2d // (a1 + a0)(b1 + b0)
+ ext XL.16b, XL.16b, XL.16b, #8
- enc_round KS0, v27
- eor XL.16b, XL.16b, XL2.16b
- eor XH.16b, XH.16b, XH2.16b
+ tbz w9, #2, 0f // <4 blocks?
+ .subsection 1
+0: movi XH2.16b, #0
+ movi XM2.16b, #0
+ movi XL2.16b, #0
- enc_round KS1, v27
- eor XM.16b, XM.16b, XM2.16b
- ext T1.16b, XL.16b, XH.16b, #8
+ tbz w9, #0, 1f // 2 blocks?
+ tbz w9, #1, 2f // 1 block?
- enc_round KS0, v28
- eor T2.16b, XL.16b, XH.16b
- eor XM.16b, XM.16b, T1.16b
+ eor T2.16b, T2.16b, XL.16b
+ ext T1.16b, T2.16b, T2.16b, #8
+ b .Lgh3
- enc_round KS1, v28
- eor XM.16b, XM.16b, T2.16b
+1: eor TT3.16b, TT3.16b, XL.16b
+ ext T2.16b, TT3.16b, TT3.16b, #8
+ b .Lgh2
- enc_round KS0, v29
- pmull T2.1q, XL.1d, MASK.1d
+2: eor TT4.16b, TT4.16b, XL.16b
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+ b .Lgh1
+ .previous
- enc_round KS1, v29
- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
+ eor T1.16b, T1.16b, XL.16b
+ ext IN1.16b, T1.16b, T1.16b, #8
- aese KS0.16b, v30.16b
- eor XL.16b, XM.16b, T2.16b
+ pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
+ eor T1.16b, T1.16b, IN1.16b
+ pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
+ pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
- aese KS1.16b, v30.16b
- ext T2.16b, XL.16b, XL.16b, #8
+ ext T1.16b, T2.16b, T2.16b, #8
+.Lgh3: eor T2.16b, T2.16b, T1.16b
+ pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
+ pmull XL.1q, HH3.1d, T1.1d // a0 * b0
+ pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
- eor KS0.16b, KS0.16b, v31.16b
- pmull XL.1q, XL.1d, MASK.1d
- eor T2.16b, T2.16b, XH.16b
+ eor XH2.16b, XH2.16b, XH.16b
+ eor XL2.16b, XL2.16b, XL.16b
+ eor XM2.16b, XM2.16b, XM.16b
- eor KS1.16b, KS1.16b, v31.16b
- eor XL.16b, XL.16b, T2.16b
+ ext T2.16b, TT3.16b, TT3.16b, #8
+.Lgh2: eor TT3.16b, TT3.16b, T2.16b
+ pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
+ pmull XL.1q, HH.1d, T2.1d // a0 * b0
+ pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
- .if \enc == 0
- eor INP0.16b, INP0.16b, KS0.16b
- eor INP1.16b, INP1.16b, KS1.16b
- .endif
+ eor XH2.16b, XH2.16b, XH.16b
+ eor XL2.16b, XL2.16b, XL.16b
+ eor XM2.16b, XM2.16b, XM.16b
- st1 {INP0.16b-INP1.16b}, [x2], #32
+ ext IN1.16b, TT4.16b, TT4.16b, #8
+.Lgh1: eor TT4.16b, TT4.16b, IN1.16b
+ pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
+ pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
+ pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
- cbnz w0, 0b
+ eor XH.16b, XH.16b, XH2.16b
+ eor XL.16b, XL.16b, XL2.16b
+ eor XM.16b, XM.16b, XM2.16b
-CPU_LE( rev x8, x8 )
- st1 {XL.2d}, [x1]
- str x8, [x5, #8] // store lower counter
+ eor T2.16b, XL.16b, XH.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor XM.16b, XM.16b, T2.16b
- .if \enc == 1
- st1 {KS0.16b-KS1.16b}, [x10]
- .endif
+ __pmull_reduce_p64
+
+ eor T2.16b, T2.16b, XH.16b
+ eor XL.16b, XL.16b, T2.16b
ret
+ENDPROC(pmull_gcm_ghash_4x)
+
+pmull_gcm_enc_4x:
+ ld1 {KS0.16b}, [x5] // load upper counter
+ sub w10, w8, #4
+ sub w11, w8, #3
+ sub w12, w8, #2
+ sub w13, w8, #1
+ rev w10, w10
+ rev w11, w11
+ rev w12, w12
+ rev w13, w13
+ mov KS1.16b, KS0.16b
+ mov KS2.16b, KS0.16b
+ mov KS3.16b, KS0.16b
+ ins KS0.s[3], w10 // set lower counter
+ ins KS1.s[3], w11
+ ins KS2.s[3], w12
+ ins KS3.s[3], w13
+
+ add x10, x6, #96 // round key pointer
+ ld1 {K6.4s-K7.4s}, [x10], #32
+ .irp key, K0, K1, K2, K3, K4, K5
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
-2: b.eq 3f // AES-192?
- enc_round KS0, v17
- enc_round KS1, v17
- enc_round KS0, v18
- enc_round KS1, v18
-3: enc_round KS0, v19
- enc_round KS1, v19
- enc_round KS0, v20
- enc_round KS1, v20
- b 1b
+ tbnz x7, #2, .Lnot128
+ .subsection 1
+.Lnot128:
+ ld1 {K8.4s-K9.4s}, [x10], #32
+ .irp key, K6, K7
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
+ ld1 {K6.4s-K7.4s}, [x10]
+ .irp key, K8, K9
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
+ tbz x7, #1, .Lout192
+ b .Lout256
+ .previous
-4: load_round_keys w7, x6
- b 0b
- .endm
+.Lout256:
+ .irp key, K6, K7
+ enc_qround KS0, KS1, KS2, KS3, \key
+ .endr
- /*
- * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[],
- * int rounds, u8 ks[])
- */
-ENTRY(pmull_gcm_encrypt)
- pmull_gcm_do_crypt 1
-ENDPROC(pmull_gcm_encrypt)
+.Lout192:
+ enc_qround KS0, KS1, KS2, KS3, KK
- /*
- * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
- * struct ghash_key const *k, u8 ctr[],
- * int rounds)
- */
-ENTRY(pmull_gcm_decrypt)
- pmull_gcm_do_crypt 0
-ENDPROC(pmull_gcm_decrypt)
+ aese KS0.16b, KL.16b
+ aese KS1.16b, KL.16b
+ aese KS2.16b, KL.16b
+ aese KS3.16b, KL.16b
+
+ eor KS0.16b, KS0.16b, KM.16b
+ eor KS1.16b, KS1.16b, KM.16b
+ eor KS2.16b, KS2.16b, KM.16b
+ eor KS3.16b, KS3.16b, KM.16b
+
+ eor INP0.16b, INP0.16b, KS0.16b
+ eor INP1.16b, INP1.16b, KS1.16b
+ eor INP2.16b, INP2.16b, KS2.16b
+ eor INP3.16b, INP3.16b, KS3.16b
- /*
- * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
- */
-ENTRY(pmull_gcm_encrypt_block)
- cbz x2, 0f
- load_round_keys w3, x2
-0: ld1 {v0.16b}, [x1]
- enc_block v0, w3
- st1 {v0.16b}, [x0]
ret
-ENDPROC(pmull_gcm_encrypt_block)
+ENDPROC(pmull_gcm_enc_4x)
+
+ .section ".rodata", "a"
+ .align 6
+.Lpermute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .previous
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 70b1469783f9..522cf004ce65 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -58,17 +58,15 @@ asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head);
-asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
+asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
+ struct ghash_key const *k, u64 dg[],
u8 ctr[], u32 const rk[], int rounds,
- u8 ks[]);
+ u8 tag[]);
-asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
- u8 ctr[], u32 const rk[], int rounds);
-
-asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
- u32 const rk[], int rounds);
+asmlinkage void pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
+ struct ghash_key const *k, u64 dg[],
+ u8 ctr[], u32 const rk[], int rounds,
+ u8 tag[]);
static int ghash_init(struct shash_desc *desc)
{
@@ -85,7 +83,7 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head))
{
- if (likely(crypto_simd_usable())) {
+ if (likely(crypto_simd_usable() && simd_update)) {
kernel_neon_begin();
simd_update(blocks, dg, src, key, head);
kernel_neon_end();
@@ -398,136 +396,112 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
}
}
-static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
- u64 dg[], u8 tag[], int cryptlen)
-{
- u8 mac[AES_BLOCK_SIZE];
- u128 lengths;
-
- lengths.a = cpu_to_be64(req->assoclen * 8);
- lengths.b = cpu_to_be64(cryptlen * 8);
-
- ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL,
- pmull_ghash_update_p64);
-
- put_unaligned_be64(dg[1], mac);
- put_unaligned_be64(dg[0], mac + 8);
-
- crypto_xor(tag, mac, AES_BLOCK_SIZE);
-}
-
static int gcm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
+ int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
+ u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
- u8 ks[2 * AES_BLOCK_SIZE];
- u8 tag[AES_BLOCK_SIZE];
u64 dg[2] = {};
- int nrounds = num_rounds(&ctx->aes_key);
+ u128 lengths;
+ u8 *tag;
int err;
+ lengths.a = cpu_to_be64(req->assoclen * 8);
+ lengths.b = cpu_to_be64(req->cryptlen * 8);
+
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
+ put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
- if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) {
- u32 const *rk = NULL;
-
- kernel_neon_begin();
- pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
- pmull_gcm_encrypt_block(ks, iv, NULL, nrounds);
- put_unaligned_be32(3, iv + GCM_IV_SIZE);
- pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL, nrounds);
- put_unaligned_be32(4, iv + GCM_IV_SIZE);
-
+ if (likely(crypto_simd_usable())) {
do {
- int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;
+
+ tag = (u8 *)&lengths;
- if (rk)
- kernel_neon_begin();
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
+ }
- pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
- iv, rk, nrounds, ks);
+ kernel_neon_begin();
+ pmull_gcm_encrypt(nbytes, dst, src, &ctx->ghash_key, dg,
+ iv, ctx->aes_key.key_enc, nrounds,
+ tag);
kernel_neon_end();
- err = skcipher_walk_done(&walk,
- walk.nbytes % (2 * AES_BLOCK_SIZE));
+ if (unlikely(!nbytes))
+ break;
- rk = ctx->aes_key.key_enc;
- } while (walk.nbytes >= 2 * AES_BLOCK_SIZE);
- } else {
- aes_encrypt(&ctx->aes_key, tag, iv);
- put_unaligned_be32(2, iv + GCM_IV_SIZE);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
- while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) {
- const int blocks =
- walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);
+ } else {
+ while (walk.nbytes >= AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
- u8 *src = walk.src.virt.addr;
int remaining = blocks;
do {
- aes_encrypt(&ctx->aes_key, ks, iv);
- crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE);
+ aes_encrypt(&ctx->aes_key, buf, iv);
+ crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--remaining > 0);
- ghash_do_update(blocks, dg,
- walk.dst.virt.addr, &ctx->ghash_key,
- NULL, pmull_ghash_update_p64);
+ ghash_do_update(blocks, dg, walk.dst.virt.addr,
+ &ctx->ghash_key, NULL, NULL);
err = skcipher_walk_done(