summary | refs | log | tree | commit | diff | stats
path: root/crypto/aes
diff options
context:
space:
mode:
author: Pauli <pauli@openssl.org> 2021-05-20 13:51:59 +1000
committer: Matt Caswell <matt@openssl.org> 2021-05-20 08:51:30 +0100
commit: e3884ec5c37334e585e9208ce69d7e5b3cad4624 (patch)
tree: 08ade3022fda3a64cd84b629736c0c9ac051833b /crypto/aes
parent: b7140b0604bdfaa034452d97648a9c23a97568e4 (diff)
Revert "ARM assembly pack: translate bit-sliced AES implementation to AArch64"
This reverts commit da51566b256e0c0536d5b986e676863b0526bf5e.

Fixes #15321

Reviewed-by: Tim Hudson <tjh@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15364)
Diffstat (limited to 'crypto/aes')
-rw-r--r-- crypto/aes/asm/bsaes-armv8.S 2338
-rw-r--r-- crypto/aes/build.info 5
2 files changed, 2 insertions, 2341 deletions
diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
deleted file mode 100644
index 9bd02d0c8a..0000000000
--- a/crypto/aes/asm/bsaes-armv8.S
+++ /dev/null
@@ -1,2338 +0,0 @@
-// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
-//
-// Licensed under the OpenSSL license (the "License"). You may not use
-// this file except in compliance with the License. You can obtain a copy
-// in the file LICENSE in the source distribution or at
-// https://www.openssl.org/source/license.html
-//
-// ====================================================================
-// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
-// project. Rights for redistribution and usage in source and binary
-// forms are granted according to the OpenSSL license.
-// ====================================================================
-//
-// This implementation is a translation of bsaes-armv7 for AArch64.
-// No attempt has been made to carry across the build switches for
-// kernel targets, since the Linux kernel crypto support has moved on
-// from when it was based on OpenSSL.
-
-// A lot of hand-scheduling has been performed. Consequently, this code
-// doesn't factor out neatly into macros in the same way that the
-// AArch32 version did. There is little to be gained by wrapping it
-// up in Perl, so it is presented as pure assembly.
-
-
-#include "crypto/arm_arch.h"
-
-.text
-
-.type _bsaes_decrypt8,%function // internal leaf routine (no calls, no stack use): bit-sliced AES decryption of v0-v7 in place
-.align 4
-// On entry:
-// x9 -> key (previously expanded using _bsaes_key_convert); read as 16 bytes, then 128 bytes per loop pass, then 16 bytes
-// x10 = number of rounds (assumed >= 1: it is decremented before it is first tested)
-// v0-v7 input data
-// On exit:
-// x9-x11 corrupted
-// other general-purpose registers preserved
-// v0-v7 output data
-// v11-v15 preserved
-// other SIMD registers corrupted (including v8-v10, whose low halves are callee-saved under AAPCS64, so callers must save them)
-_bsaes_decrypt8:
- ldr q8, [x9], #16 // v8 = first 16 bytes of key; x9 advances past them
- adr x11, .LM0ISR // x11 -> InvShiftRows constants in _bsaes_const
- movi v9.16b, #0x55 // mask for the shift-by-1 bit swap below
- ldr q10, [x11], #16 // v10 = .LM0ISR; x11 now -> .LISR (tables are contiguous)
- movi v16.16b, #0x33 // mask for the shift-by-2 bit swap
- movi v17.16b, #0x0f // mask for the shift-by-4 bit swap
- sub x10, x10, #1 // one round is accounted for by this entry sequence
- eor v0.16b, v0.16b, v8.16b // XOR the first key quadword into all eight inputs (interleaved with the tbl below)
- eor v1.16b, v1.16b, v8.16b
- eor v2.16b, v2.16b, v8.16b
- eor v4.16b, v4.16b, v8.16b
- eor v3.16b, v3.16b, v8.16b
- eor v5.16b, v5.16b, v8.16b
- tbl v0.16b, {v0.16b}, v10.16b // permute the bytes of each input using .LM0ISR
- tbl v1.16b, {v1.16b}, v10.16b
- tbl v2.16b, {v2.16b}, v10.16b
- tbl v4.16b, {v4.16b}, v10.16b
- eor v6.16b, v6.16b, v8.16b
- eor v7.16b, v7.16b, v8.16b
- tbl v3.16b, {v3.16b}, v10.16b
- tbl v5.16b, {v5.16b}, v10.16b
- tbl v6.16b, {v6.16b}, v10.16b
- ushr v8.2d, v0.2d, #1 // bit swaps: t = ((a >> n) ^ b) & mask; b ^= t; a ^= t << n. Here n=1 on (a,b) = (0,1),(4,5),(2,3),(6,7)
- tbl v7.16b, {v7.16b}, v10.16b
- ushr v10.2d, v4.2d, #1
- ushr v18.2d, v2.2d, #1
- eor v8.16b, v8.16b, v1.16b
- ushr v19.2d, v6.2d, #1
- eor v10.16b, v10.16b, v5.16b
- eor v18.16b, v18.16b, v3.16b
- and v8.16b, v8.16b, v9.16b
- eor v19.16b, v19.16b, v7.16b
- and v10.16b, v10.16b, v9.16b
- and v18.16b, v18.16b, v9.16b
- eor v1.16b, v1.16b, v8.16b
- shl v8.2d, v8.2d, #1
- and v9.16b, v19.16b, v9.16b // last use of the 0x55 mask, so v9 itself becomes the temporary
- eor v5.16b, v5.16b, v10.16b
- shl v10.2d, v10.2d, #1
- eor v3.16b, v3.16b, v18.16b
- shl v18.2d, v18.2d, #1
- eor v0.16b, v0.16b, v8.16b
- shl v8.2d, v9.2d, #1
- eor v7.16b, v7.16b, v9.16b
- eor v4.16b, v4.16b, v10.16b
- eor v2.16b, v2.16b, v18.16b
- ushr v9.2d, v1.2d, #2 // n=2, mask 0x33, on (a,b) = (1,3),(0,2),(5,7),(4,6)
- eor v6.16b, v6.16b, v8.16b
- ushr v8.2d, v0.2d, #2
- ushr v10.2d, v5.2d, #2
- ushr v18.2d, v4.2d, #2
- eor v9.16b, v9.16b, v3.16b
- eor v8.16b, v8.16b, v2.16b
- eor v10.16b, v10.16b, v7.16b
- eor v18.16b, v18.16b, v6.16b
- and v9.16b, v9.16b, v16.16b
- and v8.16b, v8.16b, v16.16b
- and v10.16b, v10.16b, v16.16b
- and v16.16b, v18.16b, v16.16b // last use of the 0x33 mask, so v16 becomes the temporary
- eor v3.16b, v3.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v2.16b, v2.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v7.16b, v7.16b, v10.16b
- shl v10.2d, v10.2d, #2
- eor v6.16b, v6.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v1.16b, v1.16b, v9.16b
- eor v0.16b, v0.16b, v8.16b
- eor v5.16b, v5.16b, v10.16b
- eor v4.16b, v4.16b, v16.16b
- ushr v8.2d, v3.2d, #4 // n=4, mask 0x0f, on (a,b) = (3,7),(2,6),(1,5),(0,4)
- ushr v9.2d, v2.2d, #4
- ushr v10.2d, v1.2d, #4
- ushr v16.2d, v0.2d, #4
- eor v8.16b, v8.16b, v7.16b
- eor v9.16b, v9.16b, v6.16b
- eor v10.16b, v10.16b, v5.16b
- eor v16.16b, v16.16b, v4.16b
- and v8.16b, v8.16b, v17.16b
- and v9.16b, v9.16b, v17.16b
- and v10.16b, v10.16b, v17.16b
- and v16.16b, v16.16b, v17.16b
- eor v7.16b, v7.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v6.16b, v6.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v5.16b, v5.16b, v10.16b
- shl v10.2d, v10.2d, #4
- eor v4.16b, v4.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v3.16b, v3.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v16.16b
- b .Ldec_sbox // the first pass skips the key addition and permutation at .Ldec_loop
-.align 4
-.Ldec_loop:
- ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 // fetch 128 bytes of key material (this ld1, the ldp and two ldr); x9 += 128 in total
- ldp q8, q9, [x9], #32
- eor v0.16b, v16.16b, v0.16b // XOR one key quadword into each of v0-v7
- ldr q10, [x9], #16
- eor v1.16b, v17.16b, v1.16b
- ldr q16, [x9], #16 // v16 already consumed above, so it is reused for the eighth quadword
- eor v2.16b, v18.16b, v2.16b
- eor v3.16b, v19.16b, v3.16b
- eor v4.16b, v8.16b, v4.16b
- eor v5.16b, v9.16b, v5.16b
- eor v6.16b, v10.16b, v6.16b
- eor v7.16b, v16.16b, v7.16b
- tbl v0.16b, {v0.16b}, v28.16b // byte permutation for this pass: v28 = .LISR, or .LISRM0 on the last pass
- tbl v1.16b, {v1.16b}, v28.16b
- tbl v2.16b, {v2.16b}, v28.16b
- tbl v3.16b, {v3.16b}, v28.16b
- tbl v4.16b, {v4.16b}, v28.16b
- tbl v5.16b, {v5.16b}, v28.16b
- tbl v6.16b, {v6.16b}, v28.16b
- tbl v7.16b, {v7.16b}, v28.16b
-.Ldec_sbox: // nonlinear layer (eor/and/orr/bsl network); presumably the bit-sliced inverse S-box - TODO confirm
- eor v1.16b, v1.16b, v4.16b
- eor v3.16b, v3.16b, v4.16b
- subs x10, x10, #1 // sets the flags tested by bcc/bne further down; nothing in between writes flags
- eor v4.16b, v4.16b, v7.16b
- eor v2.16b, v2.16b, v7.16b
- eor v1.16b, v1.16b, v6.16b
- eor v6.16b, v6.16b, v4.16b
- eor v2.16b, v2.16b, v5.16b
- eor v0.16b, v0.16b, v1.16b
- eor v7.16b, v7.16b, v6.16b
- eor v8.16b, v6.16b, v2.16b
- and v9.16b, v4.16b, v6.16b
- eor v10.16b, v2.16b, v6.16b
- eor v3.16b, v3.16b, v0.16b
- eor v5.16b, v5.16b, v0.16b
- eor v16.16b, v7.16b, v4.16b
- eor v17.16b, v4.16b, v0.16b
- and v18.16b, v0.16b, v2.16b
- eor v19.16b, v7.16b, v4.16b // same value as v16 above; v16 is overwritten later while v19 is kept
- eor v1.16b, v1.16b, v3.16b
- eor v20.16b, v3.16b, v0.16b
- eor v21.16b, v5.16b, v2.16b
- eor v22.16b, v3.16b, v7.16b
- and v8.16b, v17.16b, v8.16b
- orr v17.16b, v3.16b, v5.16b
- eor v23.16b, v1.16b, v6.16b
- eor v24.16b, v20.16b, v16.16b
- eor v25.16b, v1.16b, v5.16b
- orr v26.16b, v20.16b, v21.16b
- and v20.16b, v20.16b, v21.16b
- and v27.16b, v7.16b, v1.16b
- eor v21.16b, v21.16b, v23.16b
- orr v28.16b, v16.16b, v23.16b // v28 is reused as scratch here; the permutation vector is reloaded after the sbox
- orr v29.16b, v22.16b, v25.16b
- eor v26.16b, v26.16b, v8.16b
- and v16.16b, v16.16b, v23.16b
- and v22.16b, v22.16b, v25.16b
- and v21.16b, v24.16b, v21.16b
- eor v8.16b, v28.16b, v8.16b
- eor v23.16b, v5.16b, v2.16b
- eor v24.16b, v1.16b, v6.16b
- eor v16.16b, v16.16b, v22.16b
- eor v22.16b, v3.16b, v0.16b
- eor v25.16b, v29.16b, v21.16b
- eor v21.16b, v26.16b, v21.16b
- eor v8.16b, v8.16b, v20.16b
- eor v26.16b, v23.16b, v24.16b
- eor v16.16b, v16.16b, v20.16b
- eor v28.16b, v22.16b, v19.16b
- eor v20.16b, v25.16b, v20.16b
- eor v9.16b, v21.16b, v9.16b
- eor v8.16b, v8.16b, v18.16b
- eor v18.16b, v5.16b, v1.16b
- eor v21.16b, v16.16b, v17.16b
- eor v16.16b, v16.16b, v17.16b
- eor v17.16b, v20.16b, v27.16b
- eor v20.16b, v3.16b, v7.16b
- eor v25.16b, v9.16b, v8.16b
- eor v27.16b, v0.16b, v4.16b
- and v29.16b, v9.16b, v17.16b
- eor v30.16b, v8.16b, v29.16b
- eor v31.16b, v21.16b, v29.16b
- eor v29.16b, v21.16b, v29.16b
- bsl v30.16b, v17.16b, v21.16b // bsl: the destination acts as the bit-select mask between the two sources
- bsl v31.16b, v9.16b, v8.16b
- bsl v16.16b, v30.16b, v29.16b
- bsl v21.16b, v29.16b, v30.16b
- eor v8.16b, v31.16b, v30.16b
- and v1.16b, v1.16b, v31.16b
- and v9.16b, v16.16b, v31.16b
- and v6.16b, v6.16b, v30.16b
- eor v16.16b, v17.16b, v21.16b
- and v4.16b, v4.16b, v30.16b
- eor v17.16b, v8.16b, v30.16b
- and v21.16b, v24.16b, v8.16b
- eor v9.16b, v9.16b, v25.16b
- and v19.16b, v19.16b, v8.16b
- eor v24.16b, v30.16b, v16.16b
- eor v25.16b, v30.16b, v16.16b
- and v7.16b, v7.16b, v17.16b
- and v10.16b, v10.16b, v16.16b
- eor v29.16b, v9.16b, v16.16b
- eor v30.16b, v31.16b, v9.16b
- and v0.16b, v24.16b, v0.16b
- and v9.16b, v18.16b, v9.16b
- and v2.16b, v25.16b, v2.16b
- eor v10.16b, v10.16b, v6.16b
- eor v18.16b, v29.16b, v16.16b
- and v5.16b, v30.16b, v5.16b
- eor v24.16b, v8.16b, v29.16b
- and v25.16b, v26.16b, v29.16b
- and v26.16b, v28.16b, v29.16b
- eor v8.16b, v8.16b, v29.16b
- eor v17.16b, v17.16b, v18.16b
- eor v5.16b, v1.16b, v5.16b
- and v23.16b, v24.16b, v23.16b
- eor v21.16b, v21.16b, v25.16b
- eor v19.16b, v19.16b, v26.16b
- eor v0.16b, v4.16b, v0.16b
- and v3.16b, v17.16b, v3.16b
- eor v1.16b, v9.16b, v1.16b
- eor v9.16b, v25.16b, v23.16b
- eor v5.16b, v5.16b, v21.16b
- eor v2.16b, v6.16b, v2.16b
- and v6.16b, v8.16b, v22.16b
- eor v3.16b, v7.16b, v3.16b
- and v8.16b, v20.16b, v18.16b
- eor v10.16b, v10.16b, v9.16b
- eor v0.16b, v0.16b, v19.16b
- eor v9.16b, v1.16b, v9.16b
- eor v1.16b, v2.16b, v21.16b
- eor v3.16b, v3.16b, v19.16b
- and v16.16b, v27.16b, v16.16b
- eor v17.16b, v26.16b, v6.16b
- eor v6.16b, v8.16b, v7.16b
- eor v7.16b, v1.16b, v9.16b
- eor v1.16b, v5.16b, v3.16b
- eor v2.16b, v10.16b, v3.16b
- eor v4.16b, v16.16b, v4.16b
- eor v8.16b, v6.16b, v17.16b
- eor v5.16b, v9.16b, v3.16b
- eor v9.16b, v0.16b, v1.16b
- eor v6.16b, v7.16b, v1.16b
- eor v0.16b, v4.16b, v17.16b
- eor v4.16b, v8.16b, v7.16b
- eor v7.16b, v9.16b, v2.16b
- eor v8.16b, v3.16b, v0.16b
- eor v7.16b, v7.16b, v5.16b
- eor v3.16b, v4.16b, v7.16b
- eor v4.16b, v7.16b, v0.16b
- eor v7.16b, v8.16b, v3.16b
- bcc .Ldec_done // carry clear = the subs borrowed: x10 was already 0, so all rounds are done
- ext v8.16b, v0.16b, v0.16b, #8 // linear layer (ext byte rotations + eor); presumably InvMixColumns - TODO confirm
- ext v9.16b, v1.16b, v1.16b, #8
- ldr q28, [x11] // load from .LISR in common case (x10 > 0)
- ext v10.16b, v6.16b, v6.16b, #8
- ext v16.16b, v3.16b, v3.16b, #8
- ext v17.16b, v5.16b, v5.16b, #8
- ext v18.16b, v4.16b, v4.16b, #8
- eor v8.16b, v8.16b, v0.16b
- eor v9.16b, v9.16b, v1.16b
- eor v10.16b, v10.16b, v6.16b
- eor v16.16b, v16.16b, v3.16b
- eor v17.16b, v17.16b, v5.16b
- ext v19.16b, v2.16b, v2.16b, #8
- ext v20.16b, v7.16b, v7.16b, #8
- eor v18.16b, v18.16b, v4.16b
- eor v6.16b, v6.16b, v8.16b
- eor v8.16b, v2.16b, v10.16b
- eor v4.16b, v4.16b, v9.16b
- eor v2.16b, v19.16b, v2.16b
- eor v9.16b, v20.16b, v7.16b
- eor v0.16b, v0.16b, v16.16b
- eor v1.16b, v1.16b, v16.16b
- eor v6.16b, v6.16b, v17.16b
- eor v8.16b, v8.16b, v16.16b
- eor v7.16b, v7.16b, v18.16b
- eor v4.16b, v4.16b, v16.16b
- eor v2.16b, v3.16b, v2.16b
- eor v1.16b, v1.16b, v17.16b
- eor v3.16b, v5.16b, v9.16b
- eor v5.16b, v8.16b, v17.16b
- eor v7.16b, v7.16b, v17.16b
- ext v8.16b, v0.16b, v0.16b, #12
- ext v9.16b, v6.16b, v6.16b, #12
- ext v10.16b, v4.16b, v4.16b, #12
- ext v16.16b, v1.16b, v1.16b, #12
- ext v17.16b, v5.16b, v5.16b, #12
- ext v18.16b, v7.16b, v7.16b, #12
- eor v0.16b, v0.16b, v8.16b
- eor v6.16b, v6.16b, v9.16b
- eor v4.16b, v4.16b, v10.16b
- ext v19.16b, v2.16b, v2.16b, #12
- ext v20.16b, v3.16b, v3.16b, #12
- eor v1.16b, v1.16b, v16.16b
- eor v5.16b, v5.16b, v17.16b
- eor v7.16b, v7.16b, v18.16b
- eor v2.16b, v2.16b, v19.16b
- eor v16.16b, v16.16b, v0.16b
- eor v3.16b, v3.16b, v20.16b
- eor v17.16b, v17.16b, v4.16b
- eor v10.16b, v10.16b, v6.16b
- ext v0.16b, v0.16b, v0.16b, #8
- eor v9.16b, v9.16b, v1.16b
- ext v1.16b, v1.16b, v1.16b, #8
- eor v8.16b, v8.16b, v3.16b
- eor v16.16b, v16.16b, v3.16b
- eor v18.16b, v18.16b, v5.16b
- eor v19.16b, v19.16b, v7.16b
- ext v21.16b, v5.16b, v5.16b, #8
- ext v5.16b, v7.16b, v7.16b, #8
- eor v7.16b, v20.16b, v2.16b
- ext v4.16b, v4.16b, v4.16b, #8
- ext v20.16b, v3.16b, v3.16b, #8
- eor v17.16b, v17.16b, v3.16b
- ext v2.16b, v2.16b, v2.16b, #8
- eor v3.16b, v10.16b, v3.16b
- ext v10.16b, v6.16b, v6.16b, #8
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v16.16b
- eor v5.16b, v5.16b, v18.16b
- eor v3.16b, v3.16b, v4.16b
- eor v7.16b, v20.16b, v7.16b
- eor v6.16b, v2.16b, v19.16b
- eor v4.16b, v21.16b, v17.16b
- eor v2.16b, v10.16b, v9.16b
- bne .Ldec_loop // x10 != 0: the next pass is not the last, so v28 stays .LISR
- ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
- b .Ldec_loop
-.align 4
-.Ldec_done: // same masked bit-swap sequence as at entry (n=1, 2, 4) on the post-sbox register order, then XOR with the final key quadword
- ushr v8.2d, v0.2d, #1 // n=1, mask 0x55, on (a,b) = (0,1),(2,7),(6,4),(3,5)
- movi v9.16b, #0x55
- ldr q10, [x9] // last 16 bytes of key material; x9 is not advanced
- ushr v16.2d, v2.2d, #1
- movi v17.16b, #0x33
- ushr v18.2d, v6.2d, #1
- movi v19.16b, #0x0f
- eor v8.16b, v8.16b, v1.16b
- ushr v20.2d, v3.2d, #1
- eor v16.16b, v16.16b, v7.16b
- eor v18.16b, v18.16b, v4.16b
- and v8.16b, v8.16b, v9.16b
- eor v20.16b, v20.16b, v5.16b
- and v16.16b, v16.16b, v9.16b
- and v18.16b, v18.16b, v9.16b
- shl v21.2d, v8.2d, #1
- eor v1.16b, v1.16b, v8.16b
- and v8.16b, v20.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- shl v9.2d, v16.2d, #1
- eor v4.16b, v4.16b, v18.16b
- shl v16.2d, v18.2d, #1
- eor v0.16b, v0.16b, v21.16b
- shl v18.2d, v8.2d, #1
- eor v5.16b, v5.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- eor v6.16b, v6.16b, v16.16b
- ushr v8.2d, v1.2d, #2 // n=2, mask 0x33, on (a,b) = (1,4),(0,6),(7,5),(2,3)
- eor v3.16b, v3.16b, v18.16b
- ushr v9.2d, v0.2d, #2
- ushr v16.2d, v7.2d, #2
- ushr v18.2d, v2.2d, #2
- eor v8.16b, v8.16b, v4.16b
- eor v9.16b, v9.16b, v6.16b
- eor v16.16b, v16.16b, v5.16b
- eor v18.16b, v18.16b, v3.16b
- and v8.16b, v8.16b, v17.16b
- and v9.16b, v9.16b, v17.16b
- and v16.16b, v16.16b, v17.16b
- and v17.16b, v18.16b, v17.16b
- eor v4.16b, v4.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v6.16b, v6.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v5.16b, v5.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v3.16b, v3.16b, v17.16b
- shl v17.2d, v17.2d, #2
- eor v1.16b, v1.16b, v8.16b
- eor v0.16b, v0.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- eor v2.16b, v2.16b, v17.16b
- ushr v8.2d, v4.2d, #4 // n=4, mask 0x0f, on (a,b) = (4,5),(6,3),(1,7),(0,2)
- ushr v9.2d, v6.2d, #4
- ushr v16.2d, v1.2d, #4
- ushr v17.2d, v0.2d, #4
- eor v8.16b, v8.16b, v5.16b
- eor v9.16b, v9.16b, v3.16b
- eor v16.16b, v16.16b, v7.16b
- eor v17.16b, v17.16b, v2.16b
- and v8.16b, v8.16b, v19.16b
- and v9.16b, v9.16b, v19.16b
- and v16.16b, v16.16b, v19.16b
- and v17.16b, v17.16b, v19.16b
- eor v5.16b, v5.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v3.16b, v3.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v7.16b, v7.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v2.16b, v2.16b, v17.16b
- shl v17.2d, v17.2d, #4
- eor v4.16b, v4.16b, v8.16b
- eor v6.16b, v6.16b, v9.16b
- eor v7.16b, v7.16b, v10.16b // XOR the final key quadword into each of v0-v7 (eight eors, interleaved)
- eor v1.16b, v1.16b, v16.16b
- eor v2.16b, v2.16b, v10.16b
- eor v0.16b, v0.16b, v17.16b
- eor v4.16b, v4.16b, v10.16b
- eor v6.16b, v6.16b, v10.16b
- eor v3.16b, v3.16b, v10.16b
- eor v5.16b, v5.16b, v10.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v10.16b
- ret
-.size _bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type _bsaes_const,%object // constant table; it follows .text with no section change, so code reaches it with adr
-.align 6
-_bsaes_const:
-// InvShiftRows constants (each entry is a 16-byte tbl index vector)
-// Used in _bsaes_decrypt8, which assumes contiguity (it steps x11 through them in 16-byte increments)
-// .LM0ISR used with round 0 key
-// .LISR used with middle round keys
-// .LISRM0 used with final round key
-.LM0ISR:
-.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-.quad 0x01040b0e0205080f, 0x0306090c00070a0d
-
-// ShiftRows constants (each entry is a 16-byte tbl index vector)
-// Used in _bsaes_encrypt8, which assumes contiguity (it steps x11 through them in 16-byte increments)
-// .LM0SR used with round 0 key
-// .LSR used with middle round keys
-// .LSRM0 used with final round key
-.LM0SR:
-.quad 0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-.quad 0x0304090e00050a0f, 0x01060b0c0207080d
-
-.LM0_bigendian: // not referenced by the encrypt/decrypt routines above; _bsaes_key_convert documents x11 -> .LM0_bigendian on exit
-.quad 0x02060a0e03070b0f, 0x0004080c0105090d
-.LM0_littleendian: // presumably the little-endian counterpart chosen by _bsaes_key_convert - TODO confirm against that routine
-.quad 0x0105090d0004080c, 0x03070b0f02060a0e
-
-// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into
-// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
-.LREVM0SR:
-.quad 0x090d01050c000408, 0x03070b0f060a0e02
-
-.align 6 // padding emitted before .size, so it is counted in the object's size
-.size _bsaes_const,.-_bsaes_const
-
-.type _bsaes_encrypt8,%function // internal leaf routine (no calls, no stack use): bit-sliced AES encryption of v0-v7 in place
-.align 4
-// On entry:
-// x9 -> key (previously expanded using _bsaes_key_convert); read as 16 bytes, then 128 bytes per loop pass, then 16 bytes
-// x10 = number of rounds (assumed >= 1: it is decremented before it is first tested)
-// v0-v7 input data
-// On exit:
-// x9-x11 corrupted
-// other general-purpose registers preserved
-// v0-v7 output data
-// v11-v15 preserved
-// other SIMD registers corrupted (including v8-v10, whose low halves are callee-saved under AAPCS64, so callers must save them)
-_bsaes_encrypt8:
- ldr q8, [x9], #16 // v8 = first 16 bytes of key; x9 advances past them
- adr x11, .LM0SR // x11 -> ShiftRows constants in _bsaes_const
- ldr q9, [x11], #16 // v9 = .LM0SR; x11 now -> .LSR (tables are contiguous)
-_bsaes_encrypt8_alt: // alternate entry: v8 (first key quadword), v9 (tbl permutation), x9 and x11 must already be set up by the caller
- eor v0.16b, v0.16b, v8.16b // XOR the first key quadword into all eight inputs (interleaved with the tbl below)
- eor v1.16b, v1.16b, v8.16b
- sub x10, x10, #1 // one round is accounted for by this entry sequence
- eor v2.16b, v2.16b, v8.16b
- eor v4.16b, v4.16b, v8.16b
- eor v3.16b, v3.16b, v8.16b
- eor v5.16b, v5.16b, v8.16b
- tbl v0.16b, {v0.16b}, v9.16b // permute the bytes of each input using v9
- tbl v1.16b, {v1.16b}, v9.16b
- tbl v2.16b, {v2.16b}, v9.16b
- tbl v4.16b, {v4.16b}, v9.16b
- eor v6.16b, v6.16b, v8.16b
- eor v7.16b, v7.16b, v8.16b
- tbl v3.16b, {v3.16b}, v9.16b
- tbl v5.16b, {v5.16b}, v9.16b
- tbl v6.16b, {v6.16b}, v9.16b
- ushr v8.2d, v0.2d, #1 // bit swaps: t = ((a >> n) ^ b) & mask; b ^= t; a ^= t << n. Here n=1 on (a,b) = (0,1),(4,5),(2,3),(6,7)
- movi v10.16b, #0x55 // mask for the shift-by-1 bit swap
- tbl v7.16b, {v7.16b}, v9.16b
- ushr v9.2d, v4.2d, #1
- movi v16.16b, #0x33 // mask for the shift-by-2 bit swap
- ushr v17.2d, v2.2d, #1
- eor v8.16b, v8.16b, v1.16b
- movi v18.16b, #0x0f // mask for the shift-by-4 bit swap
- ushr v19.2d, v6.2d, #1
- eor v9.16b, v9.16b, v5.16b
- eor v17.16b, v17.16b, v3.16b
- and v8.16b, v8.16b, v10.16b
- eor v19.16b, v19.16b, v7.16b
- and v9.16b, v9.16b, v10.16b
- and v17.16b, v17.16b, v10.16b
- eor v1.16b, v1.16b, v8.16b
- shl v8.2d, v8.2d, #1
- and v10.16b, v19.16b, v10.16b // last use of the 0x55 mask, so v10 itself becomes the temporary
- eor v5.16b, v5.16b, v9.16b
- shl v9.2d, v9.2d, #1
- eor v3.16b, v3.16b, v17.16b
- shl v17.2d, v17.2d, #1
- eor v0.16b, v0.16b, v8.16b
- shl v8.2d, v10.2d, #1
- eor v7.16b, v7.16b, v10.16b
- eor v4.16b, v4.16b, v9.16b
- eor v2.16b, v2.16b, v17.16b
- ushr v9.2d, v1.2d, #2 // n=2, mask 0x33, on (a,b) = (1,3),(0,2),(5,7),(4,6)
- eor v6.16b, v6.16b, v8.16b
- ushr v8.2d, v0.2d, #2
- ushr v10.2d, v5.2d, #2
- ushr v17.2d, v4.2d, #2
- eor v9.16b, v9.16b, v3.16b
- eor v8.16b, v8.16b, v2.16b
- eor v10.16b, v10.16b, v7.16b
- eor v17.16b, v17.16b, v6.16b
- and v9.16b, v9.16b, v16.16b
- and v8.16b, v8.16b, v16.16b
- and v10.16b, v10.16b, v16.16b
- and v16.16b, v17.16b, v16.16b // last use of the 0x33 mask, so v16 becomes the temporary
- eor v3.16b, v3.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v2.16b, v2.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v7.16b, v7.16b, v10.16b
- shl v10.2d, v10.2d, #2
- eor v6.16b, v6.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v1.16b, v1.16b, v9.16b
- eor v0.16b, v0.16b, v8.16b
- eor v5.16b, v5.16b, v10.16b
- eor v4.16b, v4.16b, v16.16b
- ushr v8.2d, v3.2d, #4 // n=4, mask 0x0f, on (a,b) = (3,7),(2,6),(1,5),(0,4)
- ushr v9.2d, v2.2d, #4
- ushr v10.2d, v1.2d, #4
- ushr v16.2d, v0.2d, #4
- eor v8.16b, v8.16b, v7.16b
- eor v9.16b, v9.16b, v6.16b
- eor v10.16b, v10.16b, v5.16b
- eor v16.16b, v16.16b, v4.16b
- and v8.16b, v8.16b, v18.16b
- and v9.16b, v9.16b, v18.16b
- and v10.16b, v10.16b, v18.16b
- and v16.16b, v16.16b, v18.16b
- eor v7.16b, v7.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v6.16b, v6.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v5.16b, v5.16b, v10.16b
- shl v10.2d, v10.2d, #4
- eor v4.16b, v4.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v3.16b, v3.16b, v8.16b
- eor v2.16b, v2.16b, v9.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v16.16b
- b .Lenc_sbox // the first pass skips the key addition and permutation at .Lenc_loop
-.align 4
-.Lenc_loop:
- ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 // fetch 128 bytes of key material (this ld1, the ldp and two ldr); x9 += 128 in total
- ldp q8, q9, [x9], #32
- eor v0.16b, v16.16b, v0.16b // XOR one key quadword into each of v0-v7
- ldr q10, [x9], #16
- eor v1.16b, v17.16b, v1.16b
- ldr q16, [x9], #16 // v16 already consumed above, so it is reused for the eighth quadword
- eor v2.16b, v18.16b, v2.16b
- eor v3.16b, v19.16b, v3.16b
- eor v4.16b, v8.16b, v4.16b
- eor v5.16b, v9.16b, v5.16b
- eor v6.16b, v10.16b, v6.16b
- eor v7.16b, v16.16b, v7.16b
- tbl v0.16b, {v0.16b}, v28.16b // byte permutation for this pass: v28 = .LSR, or .LSRM0 on the last pass
- tbl v1.16b, {v1.16b}, v28.16b
- tbl v2.16b, {v2.16b}, v28.16b
- tbl v3.16b, {v3.16b}, v28.16b
- tbl v4.16b, {v4.16b}, v28.16b
- tbl v5.16b, {v5.16b}, v28.16b
- tbl v6.16b, {v6.16b}, v28.16b
- tbl v7.16b, {v7.16b}, v28.16b
-.Lenc_sbox: // nonlinear layer (eor/and/orr/bsl network); presumably the bit-sliced S-box - TODO confirm
- eor v5.16b, v5.16b, v6.16b
- eor v3.16b, v3.16b, v0.16b
- subs x10, x10, #1 // sets the flags tested by bcc/bne further down; nothing in between writes flags
- eor v2.16b, v2.16b, v1.16b
- eor v5.16b, v5.16b, v0.16b
- eor v8.16b, v3.16b, v7.16b
- eor v6.16b, v6.16b, v2.16b
- eor v7.16b, v7.16b, v5.16b
- eor v8.16b, v8.16b, v4.16b
- eor v3.16b, v6.16b, v3.16b
- eor v4.16b, v4.16b, v5.16b
- eor v6.16b, v1.16b, v5.16b
- eor v2.16b, v2.16b, v7.16b
- eor v1.16b, v8.16b, v1.16b
- eor v8.16b, v7.16b, v4.16b
- eor v9.16b, v3.16b, v0.16b
- eor v10.16b, v7.16b, v6.16b
- eor v16.16b, v5.16b, v3.16b
- eor v17.16b, v6.16b, v2.16b
- eor v18.16b, v5.16b, v1.16b
- eor v19.16b, v2.16b, v4.16b
- eor v20.16b, v1.16b, v0.16b
- orr v21.16b, v8.16b, v9.16b
- orr v22.16b, v10.16b, v16.16b
- eor v23.16b, v8.16b, v17.16b
- eor v24.16b, v9.16b, v18.16b
- and v19.16b, v19.16b, v20.16b
- orr v20.16b, v17.16b, v18.16b
- and v8.16b, v8.16b, v9.16b
- and v9.16b, v17.16b, v18.16b
- and v17.16b, v23.16b, v24.16b
- and v10.16b, v10.16b, v16.16b
- eor v16.16b, v21.16b, v19.16b
- eor v18.16b, v20.16b, v19.16b
- and v19.16b, v2.16b, v1.16b
- and v20.16b, v6.16b, v5.16b
- eor v21.16b, v22.16b, v17.16b
- eor v9.16b, v9.16b, v10.16b
- eor v10.16b, v16.16b, v17.16b
- eor v16.16b, v18.16b, v8.16b
- and v17.16b, v4.16b, v0.16b
- orr v18.16b, v7.16b, v3.16b
- eor v21.16b, v21.16b, v8.16b
- eor v8.16b, v9.16b, v8.16b
- eor v9.16b, v10.16b, v19.16b
- eor v10.16b, v3.16b, v0.16b
- eor v16.16b, v16.16b, v17.16b
- eor v17.16b, v5.16b, v1.16b
- eor v19.16b, v21.16b, v20.16b
- eor v20.16b, v8.16b, v18.16b
- eor v8.16b, v8.16b, v18.16b
- eor v18.16b, v7.16b, v4.16b
- eor v21.16b, v9.16b, v16.16b
- eor v22.16b, v6.16b, v2.16b
- and v23.16b, v9.16b, v19.16b
- eor v24.16b, v10.16b, v17.16b
- eor v25.16b, v0.16b, v1.16b
- eor v26.16b, v7.16b, v6.16b
- eor v27.16b, v18.16b, v22.16b
- eor v28.16b, v3.16b, v5.16b // v28 is reused as scratch here; the permutation vector is reloaded after the sbox
- eor v29.16b, v16.16b, v23.16b
- eor v30.16b, v20.16b, v23.16b
- eor v23.16b, v20.16b, v23.16b
- eor v31.16b, v4.16b, v2.16b
- bsl v29.16b, v19.16b, v20.16b // bsl: the destination acts as the bit-select mask between the two sources
- bsl v30.16b, v9.16b, v16.16b
- bsl v8.16b, v29.16b, v23.16b
- bsl v20.16b, v23.16b, v29.16b
- eor v9.16b, v30.16b, v29.16b
- and v5.16b, v5.16b, v30.16b
- and v8.16b, v8.16b, v30.16b
- and v1.16b, v1.16b, v29.16b
- eor v16.16b, v19.16b, v20.16b
- and v2.16b, v2.16b, v29.16b
- eor v19.16b, v9.16b, v29.16b
- and v17.16b, v17.16b, v9.16b
- eor v8.16b, v8.16b, v21.16b
- and v20.16b, v22.16b, v9.16b
- eor v21.16b, v29.16b, v16.16b
- eor v22.16b, v29.16b, v16.16b
- and v23.16b, v25.16b, v16.16b
- and v6.16b, v6.16b, v19.16b
- eor v25.16b, v8.16b, v16.16b
- eor v29.16b, v30.16b, v8.16b
- and v4.16b, v21.16b, v4.16b
- and v8.16b, v28.16b, v8.16b
- and v0.16b, v22.16b, v0.16b
- eor v21.16b, v23.16b, v1.16b
- eor v22.16b, v9.16b, v25.16b
- eor v9.16b, v9.16b, v25.16b
- eor v23.16b, v25.16b, v16.16b
- and v3.16b, v29.16b, v3.16b
- and v24.16b, v24.16b, v25.16b
- and v25.16b, v27.16b, v25.16b
- and v10.16b, v22.16b, v10.16b
- and v9.16b, v9.16b, v18.16b
- eor v18.16b, v19.16b, v23.16b
- and v19.16b, v26.16b, v23.16b
- eor v3.16b, v5.16b, v3.16b
- eor v17.16b, v17.16b, v24.16b
- eor v10.16b, v24.16b, v10.16b
- and v16.16b, v31.16b, v16.16b
- eor v20.16b, v20.16b, v25.16b
- eor v9.16b, v25.16b, v9.16b
- eor v4.16b, v2.16b, v4.16b
- and v7.16b, v18.16b, v7.16b
- eor v18.16b, v19.16b, v6.16b
- eor v5.16b, v8.16b, v5.16b
- eor v0.16b, v1.16b, v0.16b
- eor v1.16b, v21.16b, v10.16b
- eor v8.16b, v3.16b, v17.16b
- eor v2.16b, v16.16b, v2.16b
- eor v3.16b, v6.16b, v7.16b
- eor v6.16b, v18.16b, v9.16b
- eor v4.16b, v4.16b, v20.16b
- eor v10.16b, v5.16b, v10.16b
- eor v0.16b, v0.16b, v17.16b
- eor v9.16b, v2.16b, v9.16b
- eor v3.16b, v3.16b, v20.16b
- eor v7.16b, v6.16b, v1.16b
- eor v5.16b, v8.16b, v4.16b
- eor v6.16b, v10.16b, v1.16b
- eor v2.16b, v4.16b, v0.16b
- eor v4.16b, v3.16b, v10.16b
- eor v9.16b, v9.16b, v7.16b
- eor v3.16b, v0.16b, v5.16b
- eor v0.16b, v1.16b, v4.16b
- eor v1.16b, v4.16b, v8.16b
- eor v4.16b, v9.16b, v5.16b
- eor v6.16b, v6.16b, v3.16b
- bcc .Lenc_done // carry clear = the subs borrowed: x10 was already 0, so all rounds are done
- ext v8.16b, v0.16b, v0.16b, #12 // linear layer (ext byte rotations + eor); presumably MixColumns - TODO confirm
- ext v9.16b, v4.16b, v4.16b, #12
- ldr q28, [x11] // load from .LSR in common case (x10 > 0)
- ext v10.16b, v6.16b, v6.16b, #12
- ext v16.16b, v1.16b, v1.16b, #12
- ext v17.16b, v3.16b, v3.16b, #12
- ext v18.16b, v7.16b, v7.16b, #12
- eor v0.16b, v0.16b, v8.16b
- eor v4.16b, v4.16b, v9.16b
- eor v6.16b, v6.16b, v10.16b
- ext v19.16b, v2.16b, v2.16b, #12
- ext v20.16b, v5.16b, v5.16b, #12
- eor v1.16b, v1.16b, v16.16b
- eor v3.16b, v3.16b, v17.16b
- eor v7.16b, v7.16b, v18.16b
- eor v2.16b, v2.16b, v19.16b
- eor v16.16b, v16.16b, v0.16b
- eor v5.16b, v5.16b, v20.16b
- eor v17.16b, v17.16b, v6.16b
- eor v10.16b, v10.16b, v4.16b
- ext v0.16b, v0.16b, v0.16b, #8
- eor v9.16b, v9.16b, v1.16b
- ext v1.16b, v1.16b, v1.16b, #8
- eor v8.16b, v8.16b, v5.16b
- eor v16.16b, v16.16b, v5.16b
- eor v18.16b, v18.16b, v3.16b
- eor v19.16b, v19.16b, v7.16b
- ext v3.16b, v3.16b, v3.16b, #8
- ext v7.16b, v7.16b, v7.16b, #8
- eor v20.16b, v20.16b, v2.16b
- ext v6.16b, v6.16b, v6.16b, #8
- ext v21.16b, v5.16b, v5.16b, #8
- eor v17.16b, v17.16b, v5.16b
- ext v2.16b, v2.16b, v2.16b, #8
- eor v10.16b, v10.16b, v5.16b
- ext v22.16b, v4.16b, v4.16b, #8
- eor v0.16b, v0.16b, v8.16b
- eor v1.16b, v1.16b, v16.16b
- eor v5.16b, v7.16b, v18.16b
- eor v4.16b, v3.16b, v17.16b
- eor v3.16b, v6.16b, v10.16b
- eor v7.16b, v21.16b, v20.16b
- eor v6.16b, v2.16b, v19.16b
- eor v2.16b, v22.16b, v9.16b
- bne .Lenc_loop // x10 != 0: the next pass is not the last, so v28 stays .LSR
- ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
- b .Lenc_loop
-.align 4
-.Lenc_done: // same masked bit-swap sequence as at entry (n=1, 2, 4) on the post-sbox register order, then XOR with the final key quadword
- ushr v8.2d, v0.2d, #1 // n=1, mask 0x55, on (a,b) = (0,1),(3,7),(4,6),(2,5)
- movi v9.16b, #0x55
- ldr q10, [x9] // last 16 bytes of key material; x9 is not advanced
- ushr v16.2d, v3.2d, #1
- movi v17.16b, #0x33
- ushr v18.2d, v4.2d, #1
- movi v19.16b, #0x0f
- eor v8.16b, v8.16b, v1.16b
- ushr v20.2d, v2.2d, #1
- eor v16.16b, v16.16b, v7.16b
- eor v18.16b, v18.16b, v6.16b
- and v8.16b, v8.16b, v9.16b
- eor v20.16b, v20.16b, v5.16b
- and v16.16b, v16.16b, v9.16b
- and v18.16b, v18.16b, v9.16b
- shl v21.2d, v8.2d, #1
- eor v1.16b, v1.16b, v8.16b
- and v8.16b, v20.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- shl v9.2d, v16.2d, #1
- eor v6.16b, v6.16b, v18.16b
- shl v16.2d, v18.2d, #1
- eor v0.16b, v0.16b, v21.16b
- shl v18.2d, v8.2d, #1
- eor v5.16b, v5.16b, v8.16b
- eor v3.16b, v3.16b, v9.16b
- eor v4.16b, v4.16b, v16.16b
- ushr v8.2d, v1.2d, #2 // n=2, mask 0x33, on (a,b) = (1,6),(0,4),(7,5),(3,2)
- eor v2.16b, v2.16b, v18.16b
- ushr v9.2d, v0.2d, #2
- ushr v16.2d, v7.2d, #2
- ushr v18.2d, v3.2d, #2
- eor v8.16b, v8.16b, v6.16b
- eor v9.16b, v9.16b, v4.16b
- eor v16.16b, v16.16b, v5.16b
- eor v18.16b, v18.16b, v2.16b
- and v8.16b, v8.16b, v17.16b
- and v9.16b, v9.16b, v17.16b
- and v16.16b, v16.16b, v17.16b
- and v17.16b, v18.16b, v17.16b
- eor v6.16b, v6.16b, v8.16b
- shl v8.2d, v8.2d, #2
- eor v4.16b, v4.16b, v9.16b
- shl v9.2d, v9.2d, #2
- eor v5.16b, v5.16b, v16.16b
- shl v16.2d, v16.2d, #2
- eor v2.16b, v2.16b, v17.16b
- shl v17.2d, v17.2d, #2
- eor v1.16b, v1.16b, v8.16b
- eor v0.16b, v0.16b, v9.16b
- eor v7.16b, v7.16b, v16.16b
- eor v3.16b, v3.16b, v17.16b
- ushr v8.2d, v6.2d, #4 // n=4, mask 0x0f, on (a,b) = (6,5),(4,2),(1,7),(0,3)
- ushr v9.2d, v4.2d, #4
- ushr v16.2d, v1.2d, #4
- ushr v17.2d, v0.2d, #4
- eor v8.16b, v8.16b, v5.16b
- eor v9.16b, v9.16b, v2.16b
- eor v16.16b, v16.16b, v7.16b
- eor v17.16b, v17.16b, v3.16b
- and v8.16b, v8.16b, v19.16b
- and v9.16b, v9.16b, v19.16b
- and v16.16b, v16.16b, v19.16b
- and v17.16b, v17.16b, v19.16b
- eor v5.16b, v5.16b, v8.16b
- shl v8.2d, v8.2d, #4
- eor v2.16b, v2.16b, v9.16b
- shl v9.2d, v9.2d, #4
- eor v7.16b, v7.16b, v16.16b
- shl v16.2d, v16.2d, #4
- eor v3.16b, v3.16b, v17.16b
- shl v17.2d, v17.2d, #4
- eor v6.16b, v6.16b, v8.16b
- eor v4.16b, v4.16b, v9.16b
- eor v7.16b, v7.16b, v10.16b // XOR the final key quadword into each of v0-v7 (eight eors, interleaved)
- eor v1.16b, v1.16b, v16.16b
- eor v3.16b, v3.16b, v10.16b
- eor v0.16b, v0.16b, v17.16b
- eor v6.16b, v6.16b, v10.16b
- eor v4.16b, v4.16b, v10.16b
- eor v2.16b, v2.16b, v10.16b
- eor v5.16b, v5.16b, v10.16b
- eor v1.16b, v1.16b, v10.16b
- eor v0.16b, v0.16b, v10.16b
- ret
-.size _bsaes_encrypt8,.-_bsaes_encrypt8
-
-.type _bsaes_key_convert,%function
-.align 4
-// On entry:
-// x9 -> input key (big-endian)
-// x10 = number of rounds
-// x17 -> output key (native endianness)
-// On exit:
-// x9, x10 corrupted
-// x11 -> .LM0_bigendian
-// x17 -> last quadword of output key
-// other general-purpose registers preserved
-// v2-v6 preserved
-// v7.16b[] = 0x63
-// v8-v14 preserved
-// v15 = last round key (converted to native endianness)
-// other SIMD registers corrupted
-_bsaes_key_convert:
-#ifdef __ARMEL__
- adr