From 8621caa0d45e731f2e9f5889ff5bb384fcd6e059 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 8 Dec 2016 14:28:58 +0000 Subject: crypto: arm64/chacha20 - implement NEON version based on SSE3 code This is a straight port to arm64/NEON of the x86 SSE3 implementation of the ChaCha20 stream cipher. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 6 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/chacha20-neon-core.S | 480 +++++++++++++++++++++++++++++++++ arch/arm64/crypto/chacha20-neon-glue.c | 131 +++++++++ 4 files changed, 620 insertions(+) create mode 100644 arch/arm64/crypto/chacha20-neon-core.S create mode 100644 arch/arm64/crypto/chacha20-neon-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 450a85df041a..0bf0f531f539 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64 depends on ARM64 select CRYPTO_HASH +config CRYPTO_CHACHA20_NEON + tristate "NEON accelerated ChaCha20 symmetric cipher" + depends on KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_CHACHA20 + endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index aa8888d7b744..9d2826c5fccf 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o +chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S new file mode 100644 index 000000000000..e2cd65580807 --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-core.S @@ -0,0 +1,480 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + + .text + .align 6 + +ENTRY(chacha20_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + + // + // This function encrypts one ChaCha20 block by loading the state matrix + // in four NEON registers. It performs matrix operation on four words in + // parallel, but requires shuffling to rearrange the words after each + // round. + // + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + mov x3, #10 + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v4.16b, v3.16b, v0.16b + shl v3.4s, v4.4s, #8 + sri v3.4s, v4.4s, #24 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v4.16b, v3.16b, v0.16b + shl v3.4s, v4.4s, #8 + sri v3.4s, v4.4s, #24 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs x3, x3, #1 + b.ne .Ldoubleround + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ret +ENDPROC(chacha20_block_xor_neon) + + .align 6 +ENTRY(chacha20_4block_xor_neon) + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + + // + // This function encrypts four consecutive ChaCha20 blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + adr x3, CTRINC + ld1 {v16.4s}, [x3] + + // x0..15[0-3] = s0..3[0..3] + mov x4, x0 + ld4r { v0.4s- v3.4s}, [x4], #16 + ld4r { v4.4s- v7.4s}, [x4], #16 + ld4r { v8.4s-v11.4s}, [x4], #16 + ld4r {v12.4s-v15.4s}, [x4] + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v16.4s + + mov x3, #10 + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + rev32 v15.8h, v15.8h + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v17.16b, v4.16b, v8.16b + eor v18.16b, v5.16b, v9.16b + eor v19.16b, v6.16b, v10.16b + eor v20.16b, v7.16b, v11.16b + + shl v4.4s, v17.4s, #12 + shl v5.4s, v18.4s, #12 + shl v6.4s, v19.4s, #12 + shl v7.4s, v20.4s, #12 + + sri v4.4s, v17.4s, #20 + sri v5.4s, v18.4s, #20 + sri v6.4s, v19.4s, #20 + sri v7.4s, v20.4s, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v17.16b, v12.16b, v0.16b + eor v18.16b, v13.16b, v1.16b + eor v19.16b, v14.16b, v2.16b + eor v20.16b, v15.16b, v3.16b + + shl v12.4s, v17.4s, #8 + shl v13.4s, v18.4s, #8 + shl v14.4s, v19.4s, #8 + shl v15.4s, v20.4s, #8 + + sri v12.4s, v17.4s, #24 + sri v13.4s, v18.4s, #24 + sri v14.4s, v19.4s, #24 + sri v15.4s, v20.4s, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v17.16b, v4.16b, v8.16b + eor v18.16b, v5.16b, v9.16b + eor v19.16b, v6.16b, v10.16b + eor v20.16b, v7.16b, v11.16b + + shl v4.4s, v17.4s, #7 + shl v5.4s, v18.4s, #7 + shl v6.4s, v19.4s, #7 + shl v7.4s, v20.4s, #7 + + sri v4.4s, v17.4s, #25 + sri v5.4s, v18.4s, #25 + sri v6.4s, v19.4s, #25 + sri v7.4s, v20.4s, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + rev32 v15.8h, v15.8h + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v17.16b, v5.16b, v10.16b + eor v18.16b, v6.16b, v11.16b + eor v19.16b, v7.16b, v8.16b + eor v20.16b, v4.16b, v9.16b + + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + shl v4.4s, v20.4s, #12 + + sri v5.4s, v17.4s, #20 + sri v6.4s, v18.4s, #20 + sri v7.4s, v19.4s, #20 + sri v4.4s, v20.4s, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v17.16b, v15.16b, v0.16b + eor v18.16b, v12.16b, v1.16b + eor v19.16b, v13.16b, v2.16b + eor v20.16b, v14.16b, v3.16b + + shl v15.4s, v17.4s, #8 + shl v12.4s, v18.4s, #8 + shl v13.4s, v19.4s, #8 + shl v14.4s, v20.4s, #8 + + sri v15.4s, v17.4s, #24 + sri v12.4s, v18.4s, #24 + sri v13.4s, v19.4s, #24 + sri v14.4s, v20.4s, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v17.16b, v5.16b, v10.16b + eor v18.16b, v6.16b, v11.16b + eor v19.16b, v7.16b, v8.16b + eor v20.16b, v4.16b, v9.16b + + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + shl v4.4s, v20.4s, #7 + + sri v5.4s, v17.4s, #25 + sri v6.4s, v18.4s, #25 + sri v7.4s, v19.4s, #25 + sri v4.4s, v20.4s, #25 + + subs x3, x3, #1 + b.ne .Ldoubleround4 + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + ld4r {v17.4s-v20.4s}, [x0], #16 + add v0.4s, v0.4s, v17.4s + add v1.4s, v1.4s, v18.4s + add v2.4s, v2.4s, v19.4s + add v3.4s, v3.4s, v20.4s + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + ld4r {v21.4s-v24.4s}, [x0], #16 + add v4.4s, v4.4s, v21.4s + add v5.4s, v5.4s, v22.4s + add v6.4s, v6.4s, v23.4s + add v7.4s, v7.4s, v24.4s + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + ld4r {v17.4s-v20.4s}, [x0], #16 + add v8.4s, v8.4s, v17.4s + add v9.4s, v9.4s, v18.4s + add v10.4s, v10.4s, v19.4s + add v11.4s, v11.4s, v20.4s + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + ld4r {v21.4s-v24.4s}, [x0] + add v12.4s, v12.4s, v21.4s + add v13.4s, v13.4s, v22.4s + add v14.4s, v14.4s, v23.4s + add v15.4s, v15.4s, v24.4s + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v16.4s + + ld1 {v16.16b-v19.16b}, [x2], #64 + ld1 {v20.16b-v23.16b}, [x2], #64 + + // interleave 32-bit words in state n, n+1 + zip1 v24.4s, v0.4s, v1.4s + zip1 v25.4s, v2.4s, v3.4s + zip1 v26.4s, v4.4s, v5.4s + zip1 v27.4s, v6.4s, v7.4s + zip1 v28.4s, v8.4s, v9.4s + zip1 v29.4s, v10.4s, v11.4s + zip1 v30.4s, v12.4s, v13.4s + zip1 v31.4s, v14.4s, v15.4s + + zip2 v1.4s, v0.4s, v1.4s + zip2 v3.4s, v2.4s, v3.4s + zip2 v5.4s, v4.4s, v5.4s + zip2 v7.4s, v6.4s, v7.4s + zip2 v9.4s, v8.4s, v9.4s + zip2 v11.4s, v10.4s, v11.4s + zip2 v13.4s, v12.4s, v13.4s + zip2 v15.4s, v14.4s, v15.4s + + mov v0.16b, v24.16b + mov v2.16b, v25.16b + mov v4.16b, v26.16b + mov v6.16b, v27.16b + mov v8.16b, v28.16b + mov v10.16b, v29.16b + mov v12.16b, v30.16b + mov v14.16b, v31.16b + + // interleave 64-bit words in state n, n+2 + zip1 v24.2d, v0.2d, v2.2d + zip1 v25.2d, v1.2d, v3.2d + zip1 v26.2d, v4.2d, v6.2d + zip1 v27.2d, v5.2d, v7.2d + zip1 v28.2d, v8.2d, v10.2d + zip1 v29.2d, v9.2d, v11.2d + zip1 v30.2d, v12.2d, v14.2d + zip1 v31.2d, v13.2d, v15.2d + + zip2 v2.2d, v0.2d, v2.2d + zip2 v3.2d, v1.2d, v3.2d + zip2 v6.2d, v4.2d, v6.2d + zip2 v7.2d, v5.2d, v7.2d + zip2 v10.2d, v8.2d, v10.2d + zip2 v11.2d, v9.2d, v11.2d + zip2 v14.2d, v12.2d, v14.2d + zip2 v15.2d, v13.2d, v15.2d + + mov v0.16b, v24.16b + mov v1.16b, v25.16b + mov v4.16b, v26.16b + mov v5.16b, v27.16b + + mov v8.16b, v28.16b + mov v9.16b, v29.16b + mov v12.16b, v30.16b + mov v13.16b, v31.16b + + ld1 {v24.16b-v27.16b}, [x2], #64 + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v4.16b + eor v18.16b, v18.16b, v8.16b + eor v19.16b, v19.16b, v12.16b + st1 {v16.16b-v19.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v14.16b + st1 {v20.16b-v23.16b}, [x1], #64 + + eor v24.16b, v24.16b, v1.16b + eor v25.16b, v25.16b, v5.16b + eor v26.16b, v26.16b, v9.16b + eor v27.16b, v27.16b, v13.16b + st1 {v24.16b-v27.16b}, [x1], #64 + + eor v28.16b, v28.16b, v3.16b + eor v29.16b, v29.16b, v7.16b + eor v30.16b, v30.16b, v11.16b + eor v31.16b, v31.16b, v15.16b + st1 {v28.16b-v31.16b}, [x1] + + ret +ENDPROC(chacha20_4block_xor_neon) + +CTRINC: .word 0, 1, 2, 3 diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c new file mode 100644 index 000000000000..705b42b06d00 --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -0,0 +1,131 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include + +asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); + +static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_neon(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + u32 state[16]; + int err; + + if (nbytes <= CHACHA20_BLOCK_SIZE) + return crypto_chacha20_crypt(desc, dst, src, nbytes); + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + + crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + + kernel_neon_begin(); + + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); + } + + if (walk.nbytes) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + kernel_neon_end(); + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-neon", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_alignmask = sizeof(u32) - 1, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .geniv = "seqiv", + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, + }, +}; + +static int __init chacha20_simd_mod_init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); -- cgit v1.2.3 From 5386e5d1f8b7305e447b781f7ac02649f7a4d055 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 28 Dec 2016 17:39:26 +0800 Subject: Revert "crypto: arm64/ARM: NEON accelerated ChaCha20" This patch reverts the following commits: 8621caa0d45e731f2e9f5889ff5bb384fcd6e059 8096667273477e735b0072b11a6d617ccee45e5f I should not have applied them because they had already been obsoleted by a subsequent patch series. They also cause a build failure because of the subsequent commit 9ae433bc79f9. Fixes: 9ae433bc79f ("crypto: chacha20 - convert generic and...") Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 6 - arch/arm64/crypto/Makefile | 3 - arch/arm64/crypto/chacha20-neon-core.S | 480 --------------------------------- arch/arm64/crypto/chacha20-neon-glue.c | 131 --------- 4 files changed, 620 deletions(-) delete mode 100644 arch/arm64/crypto/chacha20-neon-core.S delete mode 100644 arch/arm64/crypto/chacha20-neon-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 0bf0f531f539..450a85df041a 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -72,10 +72,4 @@ config CRYPTO_CRC32_ARM64 depends on ARM64 select CRYPTO_HASH -config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" - depends on KERNEL_MODE_NEON - select CRYPTO_BLKCIPHER - select CRYPTO_CHACHA20 - endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 9d2826c5fccf..aa8888d7b744 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -41,9 +41,6 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o - AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S deleted file mode 100644 index e2cd65580807..000000000000 --- a/arch/arm64/crypto/chacha20-neon-core.S +++ /dev/null @@ -1,480 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include - - .text - .align 6 - -ENTRY(chacha20_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requires shuffling to rearrange the words after each - // round. - // - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - mov x3, #10 - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v4.16b, v3.16b, v0.16b - shl v3.4s, v4.4s, #8 - sri v3.4s, v4.4s, #24 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v4.16b, v3.16b, v0.16b - shl v3.4s, v4.4s, #8 - sri v3.4s, v4.4s, #24 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs x3, x3, #1 - b.ne .Ldoubleround - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ret -ENDPROC(chacha20_block_xor_neon) - - .align 6 -ENTRY(chacha20_4block_xor_neon) - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - adr x3, CTRINC - ld1 {v16.4s}, [x3] - - // x0..15[0-3] = s0..3[0..3] - mov x4, x0 - ld4r { v0.4s- v3.4s}, [x4], #16 - ld4r { v4.4s- v7.4s}, [x4], #16 - ld4r { v8.4s-v11.4s}, [x4], #16 - ld4r {v12.4s-v15.4s}, [x4] - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v16.4s - - mov x3, #10 - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v12.16b, v12.16b, v0.16b - eor v13.16b, v13.16b, v1.16b - eor v14.16b, v14.16b, v2.16b - eor v15.16b, v15.16b, v3.16b - - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - rev32 v15.8h, v15.8h - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v17.16b, v4.16b, v8.16b - eor v18.16b, v5.16b, v9.16b - eor v19.16b, v6.16b, v10.16b - eor v20.16b, v7.16b, v11.16b - - shl v4.4s, v17.4s, #12 - shl v5.4s, v18.4s, #12 - shl v6.4s, v19.4s, #12 - shl v7.4s, v20.4s, #12 - - sri v4.4s, v17.4s, #20 - sri v5.4s, v18.4s, #20 - sri v6.4s, v19.4s, #20 - sri v7.4s, v20.4s, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v17.16b, v12.16b, v0.16b - eor v18.16b, v13.16b, v1.16b - eor v19.16b, v14.16b, v2.16b - eor v20.16b, v15.16b, v3.16b - - shl v12.4s, v17.4s, #8 - shl v13.4s, v18.4s, #8 - shl v14.4s, v19.4s, #8 - shl v15.4s, v20.4s, #8 - - sri v12.4s, v17.4s, #24 - sri v13.4s, v18.4s, #24 - sri v14.4s, v19.4s, #24 - sri v15.4s, v20.4s, #24 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v17.16b, v4.16b, v8.16b - eor v18.16b, v5.16b, v9.16b - eor v19.16b, v6.16b, v10.16b - eor v20.16b, v7.16b, v11.16b - - shl v4.4s, v17.4s, #7 - shl v5.4s, v18.4s, #7 - shl v6.4s, v19.4s, #7 - shl v7.4s, v20.4s, #7 - - sri v4.4s, v17.4s, #25 - sri v5.4s, v18.4s, #25 - sri v6.4s, v19.4s, #25 - sri v7.4s, v20.4s, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v15.16b, v15.16b, v0.16b - eor v12.16b, v12.16b, v1.16b - eor v13.16b, v13.16b, v2.16b - eor v14.16b, v14.16b, v3.16b - - rev32 v15.8h, v15.8h - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v17.16b, v5.16b, v10.16b - eor v18.16b, v6.16b, v11.16b - eor v19.16b, v7.16b, v8.16b - eor v20.16b, v4.16b, v9.16b - - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - shl v4.4s, v20.4s, #12 - - sri v5.4s, v17.4s, #20 - sri v6.4s, v18.4s, #20 - sri v7.4s, v19.4s, #20 - sri v4.4s, v20.4s, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v17.16b, v15.16b, v0.16b - eor v18.16b, v12.16b, v1.16b - eor v19.16b, v13.16b, v2.16b - eor v20.16b, v14.16b, v3.16b - - shl v15.4s, v17.4s, #8 - shl v12.4s, v18.4s, #8 - shl v13.4s, v19.4s, #8 - shl v14.4s, v20.4s, #8 - - sri v15.4s, v17.4s, #24 - sri v12.4s, v18.4s, #24 - sri v13.4s, v19.4s, #24 - sri v14.4s, v20.4s, #24 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v17.16b, v5.16b, v10.16b - eor v18.16b, v6.16b, v11.16b - eor v19.16b, v7.16b, v8.16b - eor v20.16b, v4.16b, v9.16b - - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - shl v4.4s, v20.4s, #7 - - sri v5.4s, v17.4s, #25 - sri v6.4s, v18.4s, #25 - sri v7.4s, v19.4s, #25 - sri v4.4s, v20.4s, #25 - - subs x3, x3, #1 - b.ne .Ldoubleround4 - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] - ld4r {v17.4s-v20.4s}, [x0], #16 - add v0.4s, v0.4s, v17.4s - add v1.4s, v1.4s, v18.4s - add v2.4s, v2.4s, v19.4s - add v3.4s, v3.4s, v20.4s - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - ld4r {v21.4s-v24.4s}, [x0], #16 - add v4.4s, v4.4s, v21.4s - add v5.4s, v5.4s, v22.4s - add v6.4s, v6.4s, v23.4s - add v7.4s, v7.4s, v24.4s - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - ld4r {v17.4s-v20.4s}, [x0], #16 - add v8.4s, v8.4s, v17.4s - add v9.4s, v9.4s, v18.4s - add v10.4s, v10.4s, v19.4s - add v11.4s, v11.4s, v20.4s - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - ld4r {v21.4s-v24.4s}, [x0] - add v12.4s, v12.4s, v21.4s - add v13.4s, v13.4s, v22.4s - add v14.4s, v14.4s, v23.4s - add v15.4s, v15.4s, v24.4s - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v16.4s - - ld1 {v16.16b-v19.16b}, [x2], #64 - ld1 {v20.16b-v23.16b}, [x2], #64 - - // interleave 32-bit words in state n, n+1 - zip1 v24.4s, v0.4s, v1.4s - zip1 v25.4s, v2.4s, v3.4s - zip1 v26.4s, v4.4s, v5.4s - zip1 v27.4s, v6.4s, v7.4s - zip1 v28.4s, v8.4s, v9.4s - zip1 v29.4s, v10.4s, v11.4s - zip1 v30.4s, v12.4s, v13.4s - zip1 v31.4s, v14.4s, v15.4s - - zip2 v1.4s, v0.4s, v1.4s - zip2 v3.4s, v2.4s, v3.4s - zip2 v5.4s, v4.4s, v5.4s - zip2 v7.4s, v6.4s, v7.4s - zip2 v9.4s, v8.4s, v9.4s - zip2 v11.4s, v10.4s, v11.4s - zip2 v13.4s, v12.4s, v13.4s - zip2 v15.4s, v14.4s, v15.4s - - mov v0.16b, v24.16b - mov v2.16b, v25.16b - mov v4.16b, v26.16b - mov v6.16b, v27.16b - mov v8.16b, v28.16b - mov v10.16b, v29.16b - mov v12.16b, v30.16b - mov v14.16b, v31.16b - - // interleave 64-bit words in state n, n+2 - zip1 v24.2d, v0.2d, v2.2d - zip1 v25.2d, v1.2d, v3.2d - zip1 v26.2d, v4.2d, v6.2d - zip1 v27.2d, v5.2d, v7.2d - zip1 v28.2d, v8.2d, v10.2d - zip1 v29.2d, v9.2d, v11.2d - zip1 v30.2d, v12.2d, v14.2d - zip1 v31.2d, v13.2d, v15.2d - - zip2 v2.2d, v0.2d, v2.2d - zip2 v3.2d, v1.2d, v3.2d - zip2 v6.2d, v4.2d, v6.2d - zip2 v7.2d, v5.2d, v7.2d - zip2 v10.2d, v8.2d, v10.2d - zip2 v11.2d, v9.2d, v11.2d - zip2 v14.2d, v12.2d, v14.2d - zip2 v15.2d, v13.2d, v15.2d - - mov v0.16b, v24.16b - mov v1.16b, v25.16b - mov v4.16b, v26.16b - mov v5.16b, v27.16b - - mov v8.16b, v28.16b - mov v9.16b, v29.16b - mov v12.16b, v30.16b - mov v13.16b, v31.16b - - ld1 {v24.16b-v27.16b}, [x2], #64 - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v4.16b - eor v18.16b, v18.16b, v8.16b - eor v19.16b, v19.16b, v12.16b - st1 {v16.16b-v19.16b}, [x1], #64 - - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v14.16b - st1 {v20.16b-v23.16b}, [x1], #64 - - eor v24.16b, v24.16b, v1.16b - eor v25.16b, v25.16b, v5.16b - eor v26.16b, v26.16b, v9.16b - eor v27.16b, v27.16b, v13.16b - st1 {v24.16b-v27.16b}, [x1], #64 - - eor v28.16b, v28.16b, v3.16b - eor v29.16b, v29.16b, v7.16b - eor v30.16b, v30.16b, v11.16b - eor v31.16b, v31.16b, v15.16b - st1 {v28.16b-v31.16b}, [x1] - - ret -ENDPROC(chacha20_4block_xor_neon) - -CTRINC: .word 0, 1, 2, 3 diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c deleted file mode 100644 index 705b42b06d00..000000000000 --- a/arch/arm64/crypto/chacha20-neon-glue.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include -#include -#include -#include -#include - -#include - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); - -static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA20_BLOCK_SIZE]; - - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_4block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; - state[12] += 4; - } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } -} - -static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) -{ - struct blkcipher_walk walk; - u32 state[16]; - int err; - - if (nbytes <= CHACHA20_BLOCK_SIZE) - return crypto_chacha20_crypt(desc, dst, src, nbytes); - - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); - - crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); - - kernel_neon_begin(); - - while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); - err = blkcipher_walk_done(desc, &walk, - walk.nbytes % CHACHA20_BLOCK_SIZE); - } - - if (walk.nbytes) { - chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, - walk.nbytes); - err = blkcipher_walk_done(desc, &walk, 0); - } - - kernel_neon_end(); - - return err; -} - -static struct crypto_alg alg = { - .cra_name = "chacha20", - .cra_driver_name = "chacha20-neon", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - .cra_type = &crypto_blkcipher_type, - .cra_ctxsize = sizeof(struct chacha20_ctx), - .cra_alignmask = sizeof(u32) - 1, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .geniv = "seqiv", - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_simd, - .decrypt = chacha20_simd, - }, - }, -}; - -static int __init chacha20_simd_mod_init(void) -{ - return crypto_register_alg(&alg); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_alg(&alg); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -- cgit v1.2.3 From b7171ce9eb523fd90e38f2d138d1b6ed2ff3eafd Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:49 +0000 Subject: crypto: arm64/chacha20 - implement NEON version based on SSE3 code This is a straight port to arm64/NEON of the x86 SSE3 implementation of the ChaCha20 stream cipher. It uses the new skcipher walksize attribute to process the input in strides of 4x the block size. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 6 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/chacha20-neon-core.S | 450 +++++++++++++++++++++++++++++++++ arch/arm64/crypto/chacha20-neon-glue.c | 127 ++++++++++ 4 files changed, 586 insertions(+) create mode 100644 arch/arm64/crypto/chacha20-neon-core.S create mode 100644 arch/arm64/crypto/chacha20-neon-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 450a85df041a..0bf0f531f539 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -72,4 +72,10 @@ config CRYPTO_CRC32_ARM64 depends on ARM64 select CRYPTO_HASH +config CRYPTO_CHACHA20_NEON + tristate "NEON accelerated ChaCha20 symmetric cipher" + depends on KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_CHACHA20 + endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index aa8888d7b744..9d2826c5fccf 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -41,6 +41,9 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o +chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S new file mode 100644 index 000000000000..13c85e272c2a --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-core.S @@ -0,0 +1,450 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + + .text + .align 6 + +ENTRY(chacha20_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + + // + // This function encrypts one ChaCha20 block by loading the state matrix + // in four NEON registers. It performs matrix operation on four words in + // parallel, but requires shuffling to rearrange the words after each + // round. + // + + // x0..3 = s0..3 + adr x3, ROT8 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + ld1 {v12.4s}, [x3] + + mov x3, #10 + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs x3, x3, #1 + b.ne .Ldoubleround + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ret +ENDPROC(chacha20_block_xor_neon) + + .align 6 +ENTRY(chacha20_4block_xor_neon) + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + + // + // This function encrypts four consecutive ChaCha20 blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + adr x3, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x3] + + // x0..15[0-3] = s0..3[0..3] + mov x4, x0 + ld4r { v0.4s- v3.4s}, [x4], #16 + ld4r { v4.4s- v7.4s}, [x4], #16 + ld4r { v8.4s-v11.4s}, [x4], #16 + ld4r {v12.4s-v15.4s}, [x4] + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + mov x3, #10 + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + rev32 v15.8h, v15.8h + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v16.16b, v4.16b, v8.16b + eor v17.16b, v5.16b, v9.16b + eor v18.16b, v6.16b, v10.16b + eor v19.16b, v7.16b, v11.16b + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + sri v5.4s, v17.4s, #20 + sri v6.4s, v18.4s, #20 + sri v7.4s, v19.4s, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + + tbl v12.16b, {v12.16b}, v31.16b + tbl v13.16b, {v13.16b}, v31.16b + tbl v14.16b, {v14.16b}, v31.16b + tbl v15.16b, {v15.16b}, v31.16b + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add v9.4s, v9.4s, v13.4s + add v10.4s, v10.4s, v14.4s + add v11.4s, v11.4s, v15.4s + + eor v16.16b, v4.16b, v8.16b + eor v17.16b, v5.16b, v9.16b + eor v18.16b, v6.16b, v10.16b + eor v19.16b, v7.16b, v11.16b + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + sri v5.4s, v17.4s, #25 + sri v6.4s, v18.4s, #25 + sri v7.4s, v19.4s, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + rev32 v15.8h, v15.8h + rev32 v12.8h, v12.8h + rev32 v13.8h, v13.8h + rev32 v14.8h, v14.8h + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v16.16b, v5.16b, v10.16b + eor v17.16b, v6.16b, v11.16b + eor v18.16b, v7.16b, v8.16b + eor v19.16b, v4.16b, v9.16b + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + sri v6.4s, v17.4s, #20 + sri v7.4s, v18.4s, #20 + sri v4.4s, v19.4s, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v4.4s + + eor v15.16b, v15.16b, v0.16b + eor v12.16b, v12.16b, v1.16b + eor v13.16b, v13.16b, v2.16b + eor v14.16b, v14.16b, v3.16b + + tbl v15.16b, {v15.16b}, v31.16b + tbl v12.16b, {v12.16b}, v31.16b + tbl v13.16b, {v13.16b}, v31.16b + tbl v14.16b, {v14.16b}, v31.16b + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v12.4s + add v8.4s, v8.4s, v13.4s + add v9.4s, v9.4s, v14.4s + + eor v16.16b, v5.16b, v10.16b + eor v17.16b, v6.16b, v11.16b + eor v18.16b, v7.16b, v8.16b + eor v19.16b, v4.16b, v9.16b + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + sri v6.4s, v17.4s, #25 + sri v7.4s, v18.4s, #25 + sri v4.4s, v19.4s, #25 + + subs x3, x3, #1 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + add v1.4s, v1.4s, v17.4s + add v2.4s, v2.4s, v18.4s + add v3.4s, v3.4s, v19.4s + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + add v5.4s, v5.4s, v21.4s + add v6.4s, v6.4s, v22.4s + add v7.4s, v7.4s, v23.4s + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + add v9.4s, v9.4s, v25.4s + add v10.4s, v10.4s, v26.4s + add v11.4s, v11.4s, v27.4s + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v14.4s, v14.4s, v30.4s + add v15.4s, v15.4s, v31.4s + + // interleave 32-bit words in state n, n+1 + zip1 v16.4s, v0.4s, v1.4s + zip2 v17.4s, v0.4s, v1.4s + zip1 v18.4s, v2.4s, v3.4s + zip2 v19.4s, v2.4s, v3.4s + zip1 v20.4s, v4.4s, v5.4s + zip2 v21.4s, v4.4s, v5.4s + zip1 v22.4s, v6.4s, v7.4s + zip2 v23.4s, v6.4s, v7.4s + zip1 v24.4s, v8.4s, v9.4s + zip2 v25.4s, v8.4s, v9.4s + zip1 v26.4s, v10.4s, v11.4s + zip2 v27.4s, v10.4s, v11.4s + zip1 v28.4s, v12.4s, v13.4s + zip2 v29.4s, v12.4s, v13.4s + zip1 v30.4s, v14.4s, v15.4s + zip2 v31.4s, v14.4s, v15.4s + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + ld1 {v16.16b-v19.16b}, [x2], #64 + + zip1 v1.2d, v20.2d, v22.2d + zip2 v5.2d, v20.2d, v22.2d + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + ld1 {v20.16b-v23.16b}, [x2], #64 + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + ld1 {v24.16b-v27.16b}, [x2], #64 + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + st1 {v16.16b-v19.16b}, [x1], #64 + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + st1 {v20.16b-v23.16b}, [x1], #64 + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + eor v28.16b, v28.16b, v12.16b + st1 {v24.16b-v27.16b}, [x1], #64 + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + st1 {v28.16b-v31.16b}, [x1] + + ret +ENDPROC(chacha20_4block_xor_neon) + +CTRINC: .word 0, 1, 2, 3 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c new file mode 100644 index 000000000000..a7f2337d46cf --- /dev/null +++ b/arch/arm64/crypto/chacha20-neon-glue.c @@ -0,0 +1,127 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions + * + * Copyright (C) 2016 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include + +asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); + +static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_neon(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_neon(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + u32 state[16]; + int err; + + if (req->cryptlen <= CHACHA20_BLOCK_SIZE) + return crypto_chacha20_crypt(req); + + err = skcipher_walk_virt(&walk, req, true); + + crypto_chacha20_init(state, ctx, walk.iv); + + kernel_neon_begin(); + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + kernel_neon_end(); + + return err; +} + +static struct skcipher_alg alg = { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha20_ctx), + .base.cra_alignmask = 1, + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .chunksize = CHACHA20_BLOCK_SIZE, + .walksize = 4 * CHACHA20_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_neon, + .decrypt = chacha20_neon, +}; + +static int __init chacha20_simd_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + return crypto_register_skcipher(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_skcipher(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); -- cgit v1.2.3 From 293614ce3eda94a3c9b38d5c18fdc06eb1397221 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:51 +0000 Subject: crypto: arm64/aes-blk - expose AES-CTR as synchronous cipher as well In addition to wrapping the AES-CTR cipher into the async SIMD wrapper, which exposes it as an async skcipher that defers processing to process context, expose our AES-CTR implementation directly as a synchronous cipher as well, but with a lower priority. This makes the AES-CTR transform usable in places where synchronous transforms are required, such as the MAC802.11 encryption code, which executes in sotfirq context, where SIMD processing is allowed on arm64. Users of the async transform will keep the existing behavior. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 4e3f8adb1793..5164aaf82c6a 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -325,6 +325,23 @@ static struct skcipher_alg aes_algs[] = { { .setkey = skcipher_aes_setkey, .encrypt = ctr_encrypt, .decrypt = ctr_encrypt, +}, { + .base = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-" MODE, + .cra_priority = PRIO - 1, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = skcipher_aes_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, }, { .base = { .cra_name = "__xts(aes)", @@ -350,8 +367,9 @@ static void aes_exit(void) { int i; - for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++) - simd_skcipher_free(aes_simd_algs[i]); + for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++) + if (aes_simd_algs[i]) + simd_skcipher_free(aes_simd_algs[i]); crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); } @@ -370,6 +388,9 @@ static int __init aes_init(void) return err; for (i = 0; i < ARRAY_SIZE(aes_algs); i++) { + if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL)) + continue; + algname = aes_algs[i].base.cra_name + 2; drvname = aes_algs[i].base.cra_driver_name + 2; basename = aes_algs[i].base.cra_driver_name; -- cgit v1.2.3 From bed593c0e852f5c1efd3ca4e984fd744c51cf6ee Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:52 +0000 Subject: crypto: arm64/aes - add scalar implementation This adds a scalar implementation of AES, based on the precomputed tables that are exposed by the generic AES code. Since rotates are cheap on arm64, this implementation only uses the 4 core tables (of 1 KB each), and avoids the prerotated ones, reducing the D-cache footprint by 75%. On Cortex-A57, this code manages 13.0 cycles per byte, which is ~34% faster than the generic C code. (Note that this is still >13x slower than the code that uses the optional ARMv8 Crypto Extensions, which manages <1 cycles per byte.) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/Kconfig | 4 ++ arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/aes-cipher-core.S | 127 ++++++++++++++++++++++++++++++++++++ arch/arm64/crypto/aes-cipher-glue.c | 69 ++++++++++++++++++++ 4 files changed, 203 insertions(+) create mode 100644 arch/arm64/crypto/aes-cipher-core.S create mode 100644 arch/arm64/crypto/aes-cipher-glue.c (limited to 'arch/arm64') diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 0bf0f531f539..0826f8e599a6 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -41,6 +41,10 @@ config CRYPTO_CRC32_ARM64_CE depends on KERNEL_MODE_NEON && CRC32 select CRYPTO_HASH +config CRYPTO_AES_ARM64 + tristate "AES core cipher using scalar instructions" + select CRYPTO_AES + config CRYPTO_AES_ARM64_CE tristate "AES core cipher using ARMv8 Crypto Extensions" depends on ARM64 && KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index 9d2826c5fccf..a893507629eb 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -44,6 +44,9 @@ sha512-arm64-y := sha512-glue.o sha512-core.o obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o +aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o + AFLAGS_aes-ce.o := -DINTERLEAVE=4 AFLAGS_aes-neon.o := -DINTERLEAVE=4 diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S new file mode 100644 index 000000000000..37590ab8121a --- /dev/null +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -0,0 +1,127 @@ +/* + * Scalar AES core transform + * + * Copyright (C) 2017 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + + .text + + rk .req x0 + out .req x1 + in .req x2 + rounds .req x3 + tt .req x4 + lt .req x2 + + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc + ldp \out0, \out1, [rk], #8 + + ubfx w13, \in0, #0, #8 + ubfx w14, \in1, #8, #8 + ldr w13, [tt, w13, uxtw #2] + ldr w14, [tt, w14, uxtw #2] + + .if \enc + ubfx w17, \in1, #0, #8 + ubfx w18, \in2, #8, #8 + .else + ubfx w17, \in3, #0, #8 + ubfx w18, \in0, #8, #8 + .endif + ldr w17, [tt, w17, uxtw #2] + ldr w18, [tt, w18, uxtw #2] + + ubfx w15, \in2, #16, #8 + ubfx w16, \in3, #24, #8 + ldr w15, [tt, w15, uxtw #2] + ldr w16, [tt, w16, uxtw #2] + + .if \enc + ubfx \t0, \in3, #16, #8 + ubfx \t1, \in0, #24, #8 + .else + ubfx \t0, \in1, #16, #8 + ubfx \t1, \in2, #24, #8 + .endif + ldr \t0, [tt, \t0, uxtw #2] + ldr \t1, [tt, \t1, uxtw #2] + + eor \out0, \out0, w13 + eor \out1, \out1, w17 + eor \out0, \out0, w14, ror #24 + eor \out1, \out1, w18, ror #24 + eor \out0, \out0, w15, ror #16 + eor \out1, \out1, \t0, ror #16 + eor \out0, \out0, w16, ror #8 + eor \out1, \out1, \t1, ror #8 + .endm + + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 + .endm + + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 + .endm + + .macro do_crypt, round, ttab, ltab + ldp w5, w6, [in] + ldp w7, w8, [in, #8] + ldp w9, w10, [rk], #16 + ldp w11, w12, [rk, #-8] + +CPU_BE( rev w5, w5 ) +CPU_BE( rev w6, w6 ) +CPU_BE( rev w7, w7 ) +CPU_BE( rev w8, w8 ) + + eor w5, w5, w9 + eor w6, w6, w10 + eor w7, w7, w11 + eor w8, w8, w12 + + ldr tt, =\ttab + ldr lt, =\ltab + + tbnz rounds, #1, 1f + +0: \round w9, w10, w11, w12, w5, w6, w7, w8 + \round w5, w6, w7, w8, w9, w10, w11, w12 + +1: subs rounds, rounds, #4 + \round w9, w10, w11, w12, w5, w6, w7, w8 + csel tt, tt, lt, hi + \round w5, w6, w7, w8, w9, w10, w11, w12 + b.hi 0b + +CPU_BE( rev w5, w5 ) +CPU_BE( rev w6, w6 ) +CPU_BE( rev w7, w7 ) +CPU_BE( rev w8, w8 ) + + stp w5, w6, [out] + stp w7, w8, [out, #8] + ret + + .align 4 + .ltorg + .endm + + .align 5 +ENTRY(__aes_arm64_encrypt) + do_crypt fround, crypto_ft_tab, crypto_fl_tab +ENDPROC(__aes_arm64_encrypt) + + .align 5 +ENTRY(__aes_arm64_decrypt) + do_crypt iround, crypto_it_tab, crypto_il_tab +ENDPROC(__aes_arm64_decrypt) diff --git a/arch/arm64/crypto/aes-cipher-glue.c b/arch/arm64/crypto/aes-cipher-glue.c new file mode 100644 index 000000000000..7288e7cbebff --- /dev/null +++ b/arch/arm64/crypto/aes-cipher-glue.c @@ -0,0 +1,69 @@ +/* + * Scalar AES core transform + * + * Copyright (C) 2017 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +EXPORT_SYMBOL(__aes_arm64_encrypt); + +asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +EXPORT_SYMBOL(__aes_arm64_decrypt); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int rounds = 6 + ctx->key_length / 4; + + __aes_arm64_encrypt(ctx->key_enc, out, in, rounds); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int rounds = 6 + ctx->key_length / 4; + + __aes_arm64_decrypt(ctx->key_dec, out, in, rounds); +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-arm64", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + + .cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE, + .cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE, + .cra_cipher.cia_setkey = crypto_aes_set_key, + .cra_cipher.cia_encrypt = aes_encrypt, + .cra_cipher.cia_decrypt = aes_decrypt +}; + +static int __init aes_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Scalar AES cipher for arm64"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("aes"); -- cgit v1.2.3 From 1abee99eafab67fb1c98f9ecfc43cd5735384a86 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 11 Jan 2017 16:41:55 +0000 Subject: crypto: arm64/aes - reimplement