diff options
author | Ben Avison <bavison@riscosopen.org> | 2021-03-10 15:54:44 +0000 |
---|---|---|
committer | Pauli <pauli@openssl.org> | 2021-05-14 00:02:19 +1000 |
commit | da51566b256e0c0536d5b986e676863b0526bf5e (patch) | |
tree | 11b8b2fb3cadf4c4c2107881fdcdf40538e87059 | |
parent | 3ba3e350fd15c133a172095f67e6e0c99ab9b410 (diff) |
ARM assembly pack: translate bit-sliced AES implementation to AArch64
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14592)
-rw-r--r-- | crypto/aes/asm/bsaes-armv8.S | 2338 | ||||
-rw-r--r-- | crypto/aes/build.info | 5 |
2 files changed, 2341 insertions, 2 deletions
diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S new file mode 100644 index 0000000000..9bd02d0c8a --- /dev/null +++ b/crypto/aes/asm/bsaes-armv8.S @@ -0,0 +1,2338 @@ +// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// ==================================================================== +// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL +// project. Rights for redistribution and usage in source and binary +// forms are granted according to the OpenSSL license. +// ==================================================================== +// +// This implementation is a translation of bsaes-armv7 for AArch64. +// No attempt has been made to carry across the build switches for +// kernel targets, since the Linux kernel crypto support has moved on +// from when it was based on OpenSSL. + +// A lot of hand-scheduling has been performed. Consequently, this code +// doesn't factor out neatly into macros in the same way that the +// AArch32 version did, and there is little to be gained by wrapping it +// up in Perl, and it is presented as pure assembly. + + +#include "crypto/arm_arch.h" + +.text + +.type _bsaes_decrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_decrypt8: + ldr q8, [x9], #16 + adr x11, .LM0ISR + movi v9.16b, #0x55 + ldr q10, [x11], #16 + movi v16.16b, #0x33 + movi v17.16b, #0x0f + sub x10, x10, #1 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v10.16b + tbl v1.16b, {v1.16b}, v10.16b + tbl v2.16b, {v2.16b}, v10.16b + tbl v4.16b, {v4.16b}, v10.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v10.16b + tbl v5.16b, {v5.16b}, v10.16b + tbl v6.16b, {v6.16b}, v10.16b + ushr v8.2d, v0.2d, #1 + tbl v7.16b, {v7.16b}, v10.16b + ushr v10.2d, v4.2d, #1 + ushr v18.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + ushr v19.2d, v6.2d, #1 + eor v10.16b, v10.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v9.16b + eor v19.16b, v19.16b, v7.16b + and v10.16b, v10.16b, v9.16b + and v18.16b, v18.16b, v9.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v9.16b, v19.16b, v9.16b + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #1 + eor v3.16b, v3.16b, v18.16b + shl v18.2d, v18.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v9.2d, #1 + eor v7.16b, v7.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v18.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v18.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v10.16b, v10.16b, v17.16b + and v16.16b, v16.16b, v17.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Ldec_sbox +.align 4 +.Ldec_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Ldec_sbox: + eor v1.16b, v1.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + subs x10, x10, #1 + eor v4.16b, v4.16b, v7.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v1.16b, v6.16b + eor v6.16b, v6.16b, v4.16b + eor v2.16b, v2.16b, v5.16b + eor v0.16b, v0.16b, v1.16b + eor v7.16b, v7.16b, v6.16b + eor v8.16b, v6.16b, v2.16b + and v9.16b, v4.16b, v6.16b + eor v10.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v16.16b, v7.16b, v4.16b + eor v17.16b, v4.16b, v0.16b + and v18.16b, v0.16b, v2.16b + eor v19.16b, v7.16b, v4.16b + eor v1.16b, v1.16b, v3.16b + eor v20.16b, v3.16b, v0.16b + eor v21.16b, v5.16b, v2.16b + eor v22.16b, v3.16b, v7.16b + and v8.16b, v17.16b, v8.16b + orr v17.16b, v3.16b, v5.16b + eor v23.16b, v1.16b, v6.16b + eor v24.16b, v20.16b, v16.16b + eor v25.16b, v1.16b, v5.16b + orr v26.16b, v20.16b, v21.16b + and v20.16b, v20.16b, v21.16b + and v27.16b, v7.16b, v1.16b + eor v21.16b, v21.16b, v23.16b + orr v28.16b, v16.16b, v23.16b + orr v29.16b, v22.16b, v25.16b + eor v26.16b, v26.16b, v8.16b + and v16.16b, v16.16b, v23.16b + and v22.16b, v22.16b, v25.16b + and v21.16b, v24.16b, v21.16b + eor v8.16b, v28.16b, v8.16b + eor v23.16b, v5.16b, v2.16b + eor v24.16b, v1.16b, v6.16b + eor v16.16b, v16.16b, v22.16b + eor v22.16b, v3.16b, v0.16b + eor v25.16b, v29.16b, v21.16b + eor v21.16b, v26.16b, v21.16b + eor v8.16b, v8.16b, v20.16b + eor v26.16b, v23.16b, v24.16b + eor v16.16b, v16.16b, v20.16b + eor v28.16b, v22.16b, v19.16b + eor v20.16b, v25.16b, v20.16b + eor v9.16b, v21.16b, v9.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v5.16b, v1.16b + eor v21.16b, v16.16b, v17.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v20.16b, v27.16b + eor v20.16b, v3.16b, v7.16b + eor v25.16b, v9.16b, v8.16b + eor v27.16b, v0.16b, v4.16b + and v29.16b, v9.16b, v17.16b + eor v30.16b, v8.16b, v29.16b + eor v31.16b, v21.16b, v29.16b + eor v29.16b, v21.16b, v29.16b + bsl v30.16b, v17.16b, v21.16b + bsl v31.16b, v9.16b, v8.16b + bsl v16.16b, v30.16b, v29.16b + bsl v21.16b, v29.16b, v30.16b + eor v8.16b, v31.16b, v30.16b + and v1.16b, v1.16b, v31.16b + and v9.16b, v16.16b, v31.16b + and v6.16b, v6.16b, v30.16b + eor v16.16b, v17.16b, v21.16b + and v4.16b, v4.16b, v30.16b + eor v17.16b, v8.16b, v30.16b + and v21.16b, v24.16b, v8.16b + eor v9.16b, v9.16b, v25.16b + and v19.16b, v19.16b, v8.16b + eor v24.16b, v30.16b, v16.16b + eor v25.16b, v30.16b, v16.16b + and v7.16b, v7.16b, v17.16b + and v10.16b, v10.16b, v16.16b + eor v29.16b, v9.16b, v16.16b + eor v30.16b, v31.16b, v9.16b + and v0.16b, v24.16b, v0.16b + and v9.16b, v18.16b, v9.16b + and v2.16b, v25.16b, v2.16b + eor v10.16b, v10.16b, v6.16b + eor v18.16b, v29.16b, v16.16b + and v5.16b, v30.16b, v5.16b + eor v24.16b, v8.16b, v29.16b + and v25.16b, v26.16b, v29.16b + and v26.16b, v28.16b, v29.16b + eor v8.16b, v8.16b, v29.16b + eor v17.16b, v17.16b, v18.16b + eor v5.16b, v1.16b, v5.16b + and v23.16b, v24.16b, v23.16b + eor v21.16b, v21.16b, v25.16b + eor v19.16b, v19.16b, v26.16b + eor v0.16b, v4.16b, v0.16b + and v3.16b, v17.16b, v3.16b + eor v1.16b, v9.16b, v1.16b + eor v9.16b, v25.16b, v23.16b + eor v5.16b, v5.16b, v21.16b + eor v2.16b, v6.16b, v2.16b + and v6.16b, v8.16b, v22.16b + eor v3.16b, v7.16b, v3.16b + and v8.16b, v20.16b, v18.16b + eor v10.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v19.16b + eor v9.16b, v1.16b, v9.16b + eor v1.16b, v2.16b, v21.16b + eor v3.16b, v3.16b, v19.16b + and v16.16b, v27.16b, v16.16b + eor v17.16b, v26.16b, v6.16b + eor v6.16b, v8.16b, v7.16b + eor v7.16b, v1.16b, v9.16b + eor v1.16b, v5.16b, v3.16b + eor v2.16b, v10.16b, v3.16b + eor v4.16b, v16.16b, v4.16b + eor v8.16b, v6.16b, v17.16b + eor v5.16b, v9.16b, v3.16b + eor v9.16b, v0.16b, v1.16b + eor v6.16b, v7.16b, v1.16b + eor v0.16b, v4.16b, v17.16b + eor v4.16b, v8.16b, v7.16b + eor v7.16b, v9.16b, v2.16b + eor v8.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v5.16b + eor v3.16b, v4.16b, v7.16b + eor v4.16b, v7.16b, v0.16b + eor v7.16b, v8.16b, v3.16b + bcc .Ldec_done + ext v8.16b, v0.16b, v0.16b, #8 + ext v9.16b, v1.16b, v1.16b, #8 + ldr q28, [x11] // load from .LISR in common case (x10 > 0) + ext v10.16b, v6.16b, v6.16b, #8 + ext v16.16b, v3.16b, v3.16b, #8 + ext v17.16b, v5.16b, v5.16b, #8 + ext v18.16b, v4.16b, v4.16b, #8 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v6.16b + eor v16.16b, v16.16b, v3.16b + eor v17.16b, v17.16b, v5.16b + ext v19.16b, v2.16b, v2.16b, #8 + ext v20.16b, v7.16b, v7.16b, #8 + eor v18.16b, v18.16b, v4.16b + eor v6.16b, v6.16b, v8.16b + eor v8.16b, v2.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v19.16b, v2.16b + eor v9.16b, v20.16b, v7.16b + eor v0.16b, v0.16b, v16.16b + eor v1.16b, v1.16b, v16.16b + eor v6.16b, v6.16b, v17.16b + eor v8.16b, v8.16b, v16.16b + eor v7.16b, v7.16b, v18.16b + eor v4.16b, v4.16b, v16.16b + eor v2.16b, v3.16b, v2.16b + eor v1.16b, v1.16b, v17.16b + eor v3.16b, v5.16b, v9.16b + eor v5.16b, v8.16b, v17.16b + eor v7.16b, v7.16b, v17.16b + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v6.16b, v6.16b, #12 + ext v10.16b, v4.16b, v4.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v5.16b, v5.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v4.16b, v4.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v3.16b, v3.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v3.16b, v3.16b, v20.16b + eor v17.16b, v17.16b, v4.16b + eor v10.16b, v10.16b, v6.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v3.16b + eor v16.16b, v16.16b, v3.16b + eor v18.16b, v18.16b, v5.16b + eor v19.16b, v19.16b, v7.16b + ext v21.16b, v5.16b, v5.16b, #8 + ext v5.16b, v7.16b, v7.16b, #8 + eor v7.16b, v20.16b, v2.16b + ext v4.16b, v4.16b, v4.16b, #8 + ext v20.16b, v3.16b, v3.16b, #8 + eor v17.16b, v17.16b, v3.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v3.16b, v10.16b, v3.16b + ext v10.16b, v6.16b, v6.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v5.16b, v18.16b + eor v3.16b, v3.16b, v4.16b + eor v7.16b, v20.16b, v7.16b + eor v6.16b, v2.16b, v19.16b + eor v4.16b, v21.16b, v17.16b + eor v2.16b, v10.16b, v9.16b + bne .Ldec_loop + ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0) + b .Ldec_loop +.align 4 +.Ldec_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v2.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v6.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v3.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v4.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v4.16b, v4.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v6.16b, v6.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v3.16b, v3.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v2.2d, #2 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v6.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v4.16b, v4.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v2.16b, v2.16b, v17.16b + ushr v8.2d, v4.2d, #4 + ushr v9.2d, v6.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v3.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v2.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v4.16b, v4.16b, v8.16b + eor v6.16b, v6.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v2.16b, v2.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v4.16b, v4.16b, v10.16b + eor v6.16b, v6.16b, v10.16b + eor v3.16b, v3.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +// InvShiftRows constants +// Used in _bsaes_decrypt8, which assumes contiguity +// .LM0ISR used with round 0 key +// .LISR used with middle round keys +// .LISRM0 used with final round key +.LM0ISR: +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d + +// ShiftRows constants +// Used in _bsaes_encrypt8, which assumes contiguity +// .LM0SR used with round 0 key +// .LSR used with middle round keys +// .LSRM0 used with final round key +.LM0SR: +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d + +.LM0_bigendian: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LM0_littleendian: +.quad 0x0105090d0004080c, 0x03070b0f02060a0e + +// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into +// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR +.LREVM0SR: +.quad 0x090d01050c000408, 0x03070b0f060a0e02 + +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +// On entry: +// x9 -> key (previously expanded using _bsaes_key_convert) +// x10 = number of rounds +// v0-v7 input data +// On exit: +// x9-x11 corrupted +// other general-purpose registers preserved +// v0-v7 output data +// v11-v15 preserved +// other SIMD registers corrupted +_bsaes_encrypt8: + ldr q8, [x9], #16 + adr x11, .LM0SR + ldr q9, [x11], #16 +_bsaes_encrypt8_alt: + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v8.16b + sub x10, x10, #1 + eor v2.16b, v2.16b, v8.16b + eor v4.16b, v4.16b, v8.16b + eor v3.16b, v3.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + tbl v0.16b, {v0.16b}, v9.16b + tbl v1.16b, {v1.16b}, v9.16b + tbl v2.16b, {v2.16b}, v9.16b + tbl v4.16b, {v4.16b}, v9.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + tbl v3.16b, {v3.16b}, v9.16b + tbl v5.16b, {v5.16b}, v9.16b + tbl v6.16b, {v6.16b}, v9.16b + ushr v8.2d, v0.2d, #1 + movi v10.16b, #0x55 + tbl v7.16b, {v7.16b}, v9.16b + ushr v9.2d, v4.2d, #1 + movi v16.16b, #0x33 + ushr v17.2d, v2.2d, #1 + eor v8.16b, v8.16b, v1.16b + movi v18.16b, #0x0f + ushr v19.2d, v6.2d, #1 + eor v9.16b, v9.16b, v5.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v10.16b + eor v19.16b, v19.16b, v7.16b + and v9.16b, v9.16b, v10.16b + and v17.16b, v17.16b, v10.16b + eor v1.16b, v1.16b, v8.16b + shl v8.2d, v8.2d, #1 + and v10.16b, v19.16b, v10.16b + eor v5.16b, v5.16b, v9.16b + shl v9.2d, v9.2d, #1 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #1 + eor v0.16b, v0.16b, v8.16b + shl v8.2d, v10.2d, #1 + eor v7.16b, v7.16b, v10.16b + eor v4.16b, v4.16b, v9.16b + eor v2.16b, v2.16b, v17.16b + ushr v9.2d, v1.2d, #2 + eor v6.16b, v6.16b, v8.16b + ushr v8.2d, v0.2d, #2 + ushr v10.2d, v5.2d, #2 + ushr v17.2d, v4.2d, #2 + eor v9.16b, v9.16b, v3.16b + eor v8.16b, v8.16b, v2.16b + eor v10.16b, v10.16b, v7.16b + eor v17.16b, v17.16b, v6.16b + and v9.16b, v9.16b, v16.16b + and v8.16b, v8.16b, v16.16b + and v10.16b, v10.16b, v16.16b + and v16.16b, v17.16b, v16.16b + eor v3.16b, v3.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v2.16b, v2.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v7.16b, v7.16b, v10.16b + shl v10.2d, v10.2d, #2 + eor v6.16b, v6.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + eor v5.16b, v5.16b, v10.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v3.2d, #4 + ushr v9.2d, v2.2d, #4 + ushr v10.2d, v1.2d, #4 + ushr v16.2d, v0.2d, #4 + eor v8.16b, v8.16b, v7.16b + eor v9.16b, v9.16b, v6.16b + eor v10.16b, v10.16b, v5.16b + eor v16.16b, v16.16b, v4.16b + and v8.16b, v8.16b, v18.16b + and v9.16b, v9.16b, v18.16b + and v10.16b, v10.16b, v18.16b + and v16.16b, v16.16b, v18.16b + eor v7.16b, v7.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v6.16b, v6.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v5.16b, v5.16b, v10.16b + shl v10.2d, v10.2d, #4 + eor v4.16b, v4.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v8.16b + eor v2.16b, v2.16b, v9.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v16.16b + b .Lenc_sbox +.align 4 +.Lenc_loop: + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 + ldp q8, q9, [x9], #32 + eor v0.16b, v16.16b, v0.16b + ldr q10, [x9], #16 + eor v1.16b, v17.16b, v1.16b + ldr q16, [x9], #16 + eor v2.16b, v18.16b, v2.16b + eor v3.16b, v19.16b, v3.16b + eor v4.16b, v8.16b, v4.16b + eor v5.16b, v9.16b, v5.16b + eor v6.16b, v10.16b, v6.16b + eor v7.16b, v16.16b, v7.16b + tbl v0.16b, {v0.16b}, v28.16b + tbl v1.16b, {v1.16b}, v28.16b + tbl v2.16b, {v2.16b}, v28.16b + tbl v3.16b, {v3.16b}, v28.16b + tbl v4.16b, {v4.16b}, v28.16b + tbl v5.16b, {v5.16b}, v28.16b + tbl v6.16b, {v6.16b}, v28.16b + tbl v7.16b, {v7.16b}, v28.16b +.Lenc_sbox: + eor v5.16b, v5.16b, v6.16b + eor v3.16b, v3.16b, v0.16b + subs x10, x10, #1 + eor v2.16b, v2.16b, v1.16b + eor v5.16b, v5.16b, v0.16b + eor v8.16b, v3.16b, v7.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v5.16b + eor v8.16b, v8.16b, v4.16b + eor v3.16b, v6.16b, v3.16b + eor v4.16b, v4.16b, v5.16b + eor v6.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v7.16b + eor v1.16b, v8.16b, v1.16b + eor v8.16b, v7.16b, v4.16b + eor v9.16b, v3.16b, v0.16b + eor v10.16b, v7.16b, v6.16b + eor v16.16b, v5.16b, v3.16b + eor v17.16b, v6.16b, v2.16b + eor v18.16b, v5.16b, v1.16b + eor v19.16b, v2.16b, v4.16b + eor v20.16b, v1.16b, v0.16b + orr v21.16b, v8.16b, v9.16b + orr v22.16b, v10.16b, v16.16b + eor v23.16b, v8.16b, v17.16b + eor v24.16b, v9.16b, v18.16b + and v19.16b, v19.16b, v20.16b + orr v20.16b, v17.16b, v18.16b + and v8.16b, v8.16b, v9.16b + and v9.16b, v17.16b, v18.16b + and v17.16b, v23.16b, v24.16b + and v10.16b, v10.16b, v16.16b + eor v16.16b, v21.16b, v19.16b + eor v18.16b, v20.16b, v19.16b + and v19.16b, v2.16b, v1.16b + and v20.16b, v6.16b, v5.16b + eor v21.16b, v22.16b, v17.16b + eor v9.16b, v9.16b, v10.16b + eor v10.16b, v16.16b, v17.16b + eor v16.16b, v18.16b, v8.16b + and v17.16b, v4.16b, v0.16b + orr v18.16b, v7.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v8.16b, v9.16b, v8.16b + eor v9.16b, v10.16b, v19.16b + eor v10.16b, v3.16b, v0.16b + eor v16.16b, v16.16b, v17.16b + eor v17.16b, v5.16b, v1.16b + eor v19.16b, v21.16b, v20.16b + eor v20.16b, v8.16b, v18.16b + eor v8.16b, v8.16b, v18.16b + eor v18.16b, v7.16b, v4.16b + eor v21.16b, v9.16b, v16.16b + eor v22.16b, v6.16b, v2.16b + and v23.16b, v9.16b, v19.16b + eor v24.16b, v10.16b, v17.16b + eor v25.16b, v0.16b, v1.16b + eor v26.16b, v7.16b, v6.16b + eor v27.16b, v18.16b, v22.16b + eor v28.16b, v3.16b, v5.16b + eor v29.16b, v16.16b, v23.16b + eor v30.16b, v20.16b, v23.16b + eor v23.16b, v20.16b, v23.16b + eor v31.16b, v4.16b, v2.16b + bsl v29.16b, v19.16b, v20.16b + bsl v30.16b, v9.16b, v16.16b + bsl v8.16b, v29.16b, v23.16b + bsl v20.16b, v23.16b, v29.16b + eor v9.16b, v30.16b, v29.16b + and v5.16b, v5.16b, v30.16b + and v8.16b, v8.16b, v30.16b + and v1.16b, v1.16b, v29.16b + eor v16.16b, v19.16b, v20.16b + and v2.16b, v2.16b, v29.16b + eor v19.16b, v9.16b, v29.16b + and v17.16b, v17.16b, v9.16b + eor v8.16b, v8.16b, v21.16b + and v20.16b, v22.16b, v9.16b + eor v21.16b, v29.16b, v16.16b + eor v22.16b, v29.16b, v16.16b + and v23.16b, v25.16b, v16.16b + and v6.16b, v6.16b, v19.16b + eor v25.16b, v8.16b, v16.16b + eor v29.16b, v30.16b, v8.16b + and v4.16b, v21.16b, v4.16b + and v8.16b, v28.16b, v8.16b + and v0.16b, v22.16b, v0.16b + eor v21.16b, v23.16b, v1.16b + eor v22.16b, v9.16b, v25.16b + eor v9.16b, v9.16b, v25.16b + eor v23.16b, v25.16b, v16.16b + and v3.16b, v29.16b, v3.16b + and v24.16b, v24.16b, v25.16b + and v25.16b, v27.16b, v25.16b + and v10.16b, v22.16b, v10.16b + and v9.16b, v9.16b, v18.16b + eor v18.16b, v19.16b, v23.16b + and v19.16b, v26.16b, v23.16b + eor v3.16b, v5.16b, v3.16b + eor v17.16b, v17.16b, v24.16b + eor v10.16b, v24.16b, v10.16b + and v16.16b, v31.16b, v16.16b + eor v20.16b, v20.16b, v25.16b + eor v9.16b, v25.16b, v9.16b + eor v4.16b, v2.16b, v4.16b + and v7.16b, v18.16b, v7.16b + eor v18.16b, v19.16b, v6.16b + eor v5.16b, v8.16b, v5.16b + eor v0.16b, v1.16b, v0.16b + eor v1.16b, v21.16b, v10.16b + eor v8.16b, v3.16b, v17.16b + eor v2.16b, v16.16b, v2.16b + eor v3.16b, v6.16b, v7.16b + eor v6.16b, v18.16b, v9.16b + eor v4.16b, v4.16b, v20.16b + eor v10.16b, v5.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v9.16b, v2.16b, v9.16b + eor v3.16b, v3.16b, v20.16b + eor v7.16b, v6.16b, v1.16b + eor v5.16b, v8.16b, v4.16b + eor v6.16b, v10.16b, v1.16b + eor v2.16b, v4.16b, v0.16b + eor v4.16b, v3.16b, v10.16b + eor v9.16b, v9.16b, v7.16b + eor v3.16b, v0.16b, v5.16b + eor v0.16b, v1.16b, v4.16b + eor v1.16b, v4.16b, v8.16b + eor v4.16b, v9.16b, v5.16b + eor v6.16b, v6.16b, v3.16b + bcc .Lenc_done + ext v8.16b, v0.16b, v0.16b, #12 + ext v9.16b, v4.16b, v4.16b, #12 + ldr q28, [x11] + ext v10.16b, v6.16b, v6.16b, #12 + ext v16.16b, v1.16b, v1.16b, #12 + ext v17.16b, v3.16b, v3.16b, #12 + ext v18.16b, v7.16b, v7.16b, #12 + eor v0.16b, v0.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + ext v19.16b, v2.16b, v2.16b, #12 + ext v20.16b, v5.16b, v5.16b, #12 + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v7.16b, v18.16b + eor v2.16b, v2.16b, v19.16b + eor v16.16b, v16.16b, v0.16b + eor v5.16b, v5.16b, v20.16b + eor v17.16b, v17.16b, v6.16b + eor v10.16b, v10.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #8 + eor v9.16b, v9.16b, v1.16b + ext v1.16b, v1.16b, v1.16b, #8 + eor v8.16b, v8.16b, v5.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v7.16b + ext v3.16b, v3.16b, v3.16b, #8 + ext v7.16b, v7.16b, v7.16b, #8 + eor v20.16b, v20.16b, v2.16b + ext v6.16b, v6.16b, v6.16b, #8 + ext v21.16b, v5.16b, v5.16b, #8 + eor v17.16b, v17.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + eor v10.16b, v10.16b, v5.16b + ext v22.16b, v4.16b, v4.16b, #8 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v16.16b + eor v5.16b, v7.16b, v18.16b + eor v4.16b, v3.16b, v17.16b + eor v3.16b, v6.16b, v10.16b + eor v7.16b, v21.16b, v20.16b + eor v6.16b, v2.16b, v19.16b + eor v2.16b, v22.16b, v9.16b + bne .Lenc_loop + ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0) + b .Lenc_loop +.align 4 +.Lenc_done: + ushr v8.2d, v0.2d, #1 + movi v9.16b, #0x55 + ldr q10, [x9] + ushr v16.2d, v3.2d, #1 + movi v17.16b, #0x33 + ushr v18.2d, v4.2d, #1 + movi v19.16b, #0x0f + eor v8.16b, v8.16b, v1.16b + ushr v20.2d, v2.2d, #1 + eor v16.16b, v16.16b, v7.16b + eor v18.16b, v18.16b, v6.16b + and v8.16b, v8.16b, v9.16b + eor v20.16b, v20.16b, v5.16b + and v16.16b, v16.16b, v9.16b + and v18.16b, v18.16b, v9.16b + shl v21.2d, v8.2d, #1 + eor v1.16b, v1.16b, v8.16b + and v8.16b, v20.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + shl v9.2d, v16.2d, #1 + eor v6.16b, v6.16b, v18.16b + shl v16.2d, v18.2d, #1 + eor v0.16b, v0.16b, v21.16b + shl v18.2d, v8.2d, #1 + eor v5.16b, v5.16b, v8.16b + eor v3.16b, v3.16b, v9.16b + eor v4.16b, v4.16b, v16.16b + ushr v8.2d, v1.2d, #2 + eor v2.16b, v2.16b, v18.16b + ushr v9.2d, v0.2d, #2 + ushr v16.2d, v7.2d, #2 + ushr v18.2d, v3.2d, #2 + eor v8.16b, v8.16b, v6.16b + eor v9.16b, v9.16b, v4.16b + eor v16.16b, v16.16b, v5.16b + eor v18.16b, v18.16b, v2.16b + and v8.16b, v8.16b, v17.16b + and v9.16b, v9.16b, v17.16b + and v16.16b, v16.16b, v17.16b + and v17.16b, v18.16b, v17.16b + eor v6.16b, v6.16b, v8.16b + shl v8.2d, v8.2d, #2 + eor v4.16b, v4.16b, v9.16b + shl v9.2d, v9.2d, #2 + eor v5.16b, v5.16b, v16.16b + shl v16.2d, v16.2d, #2 + eor v2.16b, v2.16b, v17.16b + shl v17.2d, v17.2d, #2 + eor v1.16b, v1.16b, v8.16b + eor v0.16b, v0.16b, v9.16b + eor v7.16b, v7.16b, v16.16b + eor v3.16b, v3.16b, v17.16b + ushr v8.2d, v6.2d, #4 + ushr v9.2d, v4.2d, #4 + ushr v16.2d, v1.2d, #4 + ushr v17.2d, v0.2d, #4 + eor v8.16b, v8.16b, v5.16b + eor v9.16b, v9.16b, v2.16b + eor v16.16b, v16.16b, v7.16b + eor v17.16b, v17.16b, v3.16b + and v8.16b, v8.16b, v19.16b + and v9.16b, v9.16b, v19.16b + and v16.16b, v16.16b, v19.16b + and v17.16b, v17.16b, v19.16b + eor v5.16b, v5.16b, v8.16b + shl v8.2d, v8.2d, #4 + eor v2.16b, v2.16b, v9.16b + shl v9.2d, v9.2d, #4 + eor v7.16b, v7.16b, v16.16b + shl v16.2d, v16.2d, #4 + eor v3.16b, v3.16b, v17.16b + shl v17.2d, v17.2d, #4 + eor v6.16b, v6.16b, v8.16b + eor v4.16b, v4.16b, v9.16b + eor v7.16b, v7.16b, v10.16b + eor v1.16b, v1.16b, v16.16b + eor v3.16b, v3.16b, v10.16b + eor v0.16b, v0.16b, v17.16b + eor v6.16b, v6.16b, v10.16b + eor v4.16b, v4.16b, v10.16b + eor v2.16b, v2.16b, v10.16b + eor v5.16b, v5.16b, v10.16b + eor v1.16b, v1.16b, v10.16b + eor v0.16b, v0.16b, v10.16b + ret +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_key_convert,%function +.align 4 +// On entry: +// x9 -> input key (big-endian) +// x10 = number of rounds +// x17 -> output key (native endianness) +// On exit: +// x9, x10 corrupted +// x11 -> .LM0_bigendian +// x17 -> last quadword of output key +// other general-purpose registers preserved +// v2-v6 preserved +// v7.16b[] = 0x63 +// v8-v14 preserved +// v15 = last round key (converted to native endianness) +// other SIMD registers corrupted +_bsaes_key_convert: +#ifdef __ARMEL__ + adr x11, .LM0_littleendian +#else + adr x11, .LM0_bigendian +#endif + ldr q0, [x9], #16 // load round 0 key + ldr q1, [x11] // .LM0 + ldr q15, [x9], #16 // load round 1 key + + movi v7.16b, #0x63 // compose .L63 + movi v16.16b, #0x01 // bit masks + movi v17.16b, #0x02 + movi v18.16b, #0x04 + movi v19.16b, #0x08 + movi v20.16b, #0x10 + movi v21.16b, #0x20 + movi v22.16b, #0x40 + movi v23.16b, #0x80 + +#ifdef __ARMEL__ + rev32 v0.16b, v0.16b +#endif + sub x10, x10, #1 + str q0, [x17], #16 // save round 0 key + |