/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
* Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
SHASH .req v0
SHASH2 .req v1
T1 .req v2
T2 .req v3
MASK .req v4
XM .req v5
XL .req v6
XH .req v7
IN1 .req v7
k00_16 .req v8
k32_48 .req v9
t3 .req v10
t4 .req v11
t5 .req v12
t6 .req v13
t7 .req v14
t8 .req v15
t9 .req v16
perm1 .req v17
perm2 .req v18
perm3 .req v19
sh1 .req v20
sh2 .req v21
sh3 .req v22
sh4 .req v23
ss1 .req v24
ss2 .req v25
ss3 .req v26
ss4 .req v27
XL2 .req v8
XM2 .req v9
XH2 .req v10
XL3 .req v11
XM3 .req v12
XH3 .req v13
TT3 .req v14
TT4 .req v15
HH .req v16
HH3 .req v17
HH4 .req v18
HH34 .req v19
.text
.arch armv8-a+crypto
.macro __pmull_p64, rd, rn, rm
pmull \rd\().1q, \rn\().1d, \rm\().1d
.endm
.macro __pmull2_p64, rd, rn, rm
pmull2 \rd\().1q, \rn\().2d, \rm\().2d
.endm
.macro __pmull_p8, rq, ad, bd
ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1
ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2
ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3
__pmull_p8_\bd \rq, \ad
.endm
.macro __pmull2_p8, rq, ad, bd
tbl t3.16b, {\ad\().16b}, perm1.16b // A1
tbl t5.16b, {\ad\().16b}, perm2.16b // A2
tbl t7.16b, {\ad\().16b}, perm3.16b // A3
__pmull2_p8_\bd \rq, \ad
.endm
.macro __pmull_p8_SHASH, rq, ad
__pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
.endm
.macro __pmull_p8_SHASH2, rq, ad
__pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
.endm
.macro __pmull2_p8_SHASH, rq, ad
__pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
.endm
.macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
pmull\t t3.8h, t3.\nb, \bd // F = A1*B
pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1
pmull\t t5.8h, t5.\nb, \bd // H = A2*B
pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2
pmull\t t7.8h, t7.\nb, \bd // J = A3*B
pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3
pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4
pmull\t \rq\().8h, \ad, \bd // D = A*B
eor t3.16b, t3.16b, t4.16b // L = E + F
eor t5.16b, t5.16b, t6.16b // M = G + H
eor t7.16b, t7.16b, t8.16b // N = I + J
uzp1 t4.2d, t3.2d, t5.2d
uzp2 t3.2d, t3.2d, t5.2d
uzp1 t6.2d, t7.2d, t9.2d
uzp2 t7.2d, t7.2d, t9.2d
// t3 = (L) (P0 + P1) << 8
// t5 = (M) (P2 + P3) << 16
eor t4.16b, t4.16b, t3.16b
and t3.16b, t3.16b, k32_48.16b
// t7 = (N) (P4 + P5) << 24
// t9 = (K) (P6 + P7) << 32
eor t6.16b, t6.16b, t7.16b
and t7.16b, t7.16b, k00_16.16b
eor t4.16b, t4.16b, t3.16b
eor t6.16b, t6.16b, t7.16b
zip2 t5.2d, t4.2d, t3.2d
zip1 t3.2d, t4.2d, t3.2d
zip2 t9.2d, t6.2d, t7.2d
zip1 t7.2d, t6.2d, t7.2d
ext t3.16b, t3.16b, t3.16b, #15
ext t5.16b, t5.16b, t5.16b, #14
ext t7.16b, t7.16b, t7.16b, #13
ext t9.16b, t9.16b, t9.16b, #12
eor t3.16b, t3.16b, t5.16b
eor t7.16b, t7.16b, t9.16b
eor \rq\().16b, \rq\().16b, t3.16b
eor \rq\().16b, \rq\().16b, t7.16b
.endm
.macro __pmull_pre_p64
add x8, x3, #16
ld1 {HH.2d-HH4.2d}, [x8]
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d,