summary refs log tree commit diff stats
path: root/crypto/aes
diff options
context:
space:
mode:
authorBen Avison <bavison@riscosopen.org>2021-03-10 15:54:44 +0000
committerPauli <pauli@openssl.org>2021-05-14 00:02:19 +1000
commitda51566b256e0c0536d5b986e676863b0526bf5e (patch)
tree11b8b2fb3cadf4c4c2107881fdcdf40538e87059 /crypto/aes
parent3ba3e350fd15c133a172095f67e6e0c99ab9b410 (diff)
ARM assembly pack: translate bit-sliced AES implementation to AArch64
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14592)
Diffstat (limited to 'crypto/aes')
-rw-r--r--crypto/aes/asm/bsaes-armv8.S2338
-rw-r--r--crypto/aes/build.info5
2 files changed, 2341 insertions, 2 deletions
diff --git a/crypto/aes/asm/bsaes-armv8.S b/crypto/aes/asm/bsaes-armv8.S
new file mode 100644
index 0000000000..9bd02d0c8a
--- /dev/null
+++ b/crypto/aes/asm/bsaes-armv8.S
@@ -0,0 +1,2338 @@
+// Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// ====================================================================
+// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
+// project. Rights for redistribution and usage in source and binary
+// forms are granted according to the OpenSSL license.
+// ====================================================================
+//
+// This implementation is a translation of bsaes-armv7 for AArch64.
+// No attempt has been made to carry across the build switches for
+// kernel targets, since the Linux kernel crypto support has moved on
+// from when it was based on OpenSSL.
+
+// A lot of hand-scheduling has been performed. Consequently, this code
+// doesn't factor out neatly into macros in the same way that the
+// AArch32 version did, and there is little to be gained by wrapping it
+// up in Perl, so it is presented as pure assembly.
+
+
+#include "crypto/arm_arch.h"
+
+.text
+
+.type _bsaes_decrypt8,%function
+.align 4
+// Decrypt eight AES blocks in parallel using a bit-sliced representation.
+// On entry:
+// x9 -> key (previously expanded using _bsaes_key_convert)
+// x10 = number of rounds
+// v0-v7 input data
+// On exit:
+// x9-x11 corrupted
+// other general-purpose registers preserved
+// v0-v7 output data
+// v11-v15 preserved
+// other SIMD registers corrupted
+_bsaes_decrypt8:
+// Round 0: XOR in the first round key (q8) and permute each block's bytes
+// with .LM0ISR (InvShiftRows combined with the bit-slice pre-permutation).
+ ldr q8, [x9], #16
+ adr x11, .LM0ISR
+ movi v9.16b, #0x55
+ ldr q10, [x11], #16
+ movi v16.16b, #0x33
+ movi v17.16b, #0x0f
+ sub x10, x10, #1
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v8.16b
+ eor v2.16b, v2.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ tbl v0.16b, {v0.16b}, v10.16b
+ tbl v1.16b, {v1.16b}, v10.16b
+ tbl v2.16b, {v2.16b}, v10.16b
+ tbl v4.16b, {v4.16b}, v10.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ tbl v3.16b, {v3.16b}, v10.16b
+ tbl v5.16b, {v5.16b}, v10.16b
+ tbl v6.16b, {v6.16b}, v10.16b
+// Bit-slice the state: exchange bit groups between register pairs at
+// distances 1, 2 and 4 using the masks 0x55, 0x33 and 0x0f, so that each
+// of v0-v7 ends up holding one bit-plane of all eight blocks.
+ ushr v8.2d, v0.2d, #1
+ tbl v7.16b, {v7.16b}, v10.16b
+ ushr v10.2d, v4.2d, #1
+ ushr v18.2d, v2.2d, #1
+ eor v8.16b, v8.16b, v1.16b
+ ushr v19.2d, v6.2d, #1
+ eor v10.16b, v10.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v19.16b, v19.16b, v7.16b
+ and v10.16b, v10.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ eor v1.16b, v1.16b, v8.16b
+ shl v8.2d, v8.2d, #1
+ and v9.16b, v19.16b, v9.16b
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #1
+ eor v3.16b, v3.16b, v18.16b
+ shl v18.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v8.16b
+ shl v8.2d, v9.2d, #1
+ eor v7.16b, v7.16b, v9.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v2.16b, v2.16b, v18.16b
+ ushr v9.2d, v1.2d, #2
+ eor v6.16b, v6.16b, v8.16b
+ ushr v8.2d, v0.2d, #2
+ ushr v10.2d, v5.2d, #2
+ ushr v18.2d, v4.2d, #2
+ eor v9.16b, v9.16b, v3.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v10.16b, v10.16b, v7.16b
+ eor v18.16b, v18.16b, v6.16b
+ and v9.16b, v9.16b, v16.16b
+ and v8.16b, v8.16b, v16.16b
+ and v10.16b, v10.16b, v16.16b
+ and v16.16b, v18.16b, v16.16b
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v2.16b, v2.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v7.16b, v7.16b, v10.16b
+ shl v10.2d, v10.2d, #2
+ eor v6.16b, v6.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v3.2d, #4
+ ushr v9.2d, v2.2d, #4
+ ushr v10.2d, v1.2d, #4
+ ushr v16.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v7.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v10.16b, v10.16b, v5.16b
+ eor v16.16b, v16.16b, v4.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v10.16b, v10.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ eor v7.16b, v7.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #4
+ eor v4.16b, v4.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v16.16b
+ b .Ldec_sbox
+.align 4
+// Main round loop: XOR in the next round key, permute bytes with the
+// table currently held in v28 (.LISR for middle rounds, .LISRM0 for the
+// last round), then fall through into the inverse S-box.
+.Ldec_loop:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
+ ldp q8, q9, [x9], #32
+ eor v0.16b, v16.16b, v0.16b
+ ldr q10, [x9], #16
+ eor v1.16b, v17.16b, v1.16b
+ ldr q16, [x9], #16
+ eor v2.16b, v18.16b, v2.16b
+ eor v3.16b, v19.16b, v3.16b
+ eor v4.16b, v8.16b, v4.16b
+ eor v5.16b, v9.16b, v5.16b
+ eor v6.16b, v10.16b, v6.16b
+ eor v7.16b, v16.16b, v7.16b
+ tbl v0.16b, {v0.16b}, v28.16b
+ tbl v1.16b, {v1.16b}, v28.16b
+ tbl v2.16b, {v2.16b}, v28.16b
+ tbl v3.16b, {v3.16b}, v28.16b
+ tbl v4.16b, {v4.16b}, v28.16b
+ tbl v5.16b, {v5.16b}, v28.16b
+ tbl v6.16b, {v6.16b}, v28.16b
+ tbl v7.16b, {v7.16b}, v28.16b
+// Inverse AES S-box, evaluated as a boolean circuit over the bit-sliced
+// state (v0-v7 are bit-planes; v8-v31 are circuit temporaries).
+// The round counter is decremented here; the resulting flags are
+// consumed by the bcc/bne below.
+.Ldec_sbox:
+ eor v1.16b, v1.16b, v4.16b
+ eor v3.16b, v3.16b, v4.16b
+ subs x10, x10, #1
+ eor v4.16b, v4.16b, v7.16b
+ eor v2.16b, v2.16b, v7.16b
+ eor v1.16b, v1.16b, v6.16b
+ eor v6.16b, v6.16b, v4.16b
+ eor v2.16b, v2.16b, v5.16b
+ eor v0.16b, v0.16b, v1.16b
+ eor v7.16b, v7.16b, v6.16b
+ eor v8.16b, v6.16b, v2.16b
+ and v9.16b, v4.16b, v6.16b
+ eor v10.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v16.16b, v7.16b, v4.16b
+ eor v17.16b, v4.16b, v0.16b
+ and v18.16b, v0.16b, v2.16b
+ eor v19.16b, v7.16b, v4.16b
+ eor v1.16b, v1.16b, v3.16b
+ eor v20.16b, v3.16b, v0.16b
+ eor v21.16b, v5.16b, v2.16b
+ eor v22.16b, v3.16b, v7.16b
+ and v8.16b, v17.16b, v8.16b
+ orr v17.16b, v3.16b, v5.16b
+ eor v23.16b, v1.16b, v6.16b
+ eor v24.16b, v20.16b, v16.16b
+ eor v25.16b, v1.16b, v5.16b
+ orr v26.16b, v20.16b, v21.16b
+ and v20.16b, v20.16b, v21.16b
+ and v27.16b, v7.16b, v1.16b
+ eor v21.16b, v21.16b, v23.16b
+ orr v28.16b, v16.16b, v23.16b
+ orr v29.16b, v22.16b, v25.16b
+ eor v26.16b, v26.16b, v8.16b
+ and v16.16b, v16.16b, v23.16b
+ and v22.16b, v22.16b, v25.16b
+ and v21.16b, v24.16b, v21.16b
+ eor v8.16b, v28.16b, v8.16b
+ eor v23.16b, v5.16b, v2.16b
+ eor v24.16b, v1.16b, v6.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v22.16b, v3.16b, v0.16b
+ eor v25.16b, v29.16b, v21.16b
+ eor v21.16b, v26.16b, v21.16b
+ eor v8.16b, v8.16b, v20.16b
+ eor v26.16b, v23.16b, v24.16b
+ eor v16.16b, v16.16b, v20.16b
+ eor v28.16b, v22.16b, v19.16b
+ eor v20.16b, v25.16b, v20.16b
+ eor v9.16b, v21.16b, v9.16b
+ eor v8.16b, v8.16b, v18.16b
+ eor v18.16b, v5.16b, v1.16b
+ eor v21.16b, v16.16b, v17.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v17.16b, v20.16b, v27.16b
+ eor v20.16b, v3.16b, v7.16b
+ eor v25.16b, v9.16b, v8.16b
+ eor v27.16b, v0.16b, v4.16b
+ and v29.16b, v9.16b, v17.16b
+ eor v30.16b, v8.16b, v29.16b
+ eor v31.16b, v21.16b, v29.16b
+ eor v29.16b, v21.16b, v29.16b
+ bsl v30.16b, v17.16b, v21.16b
+ bsl v31.16b, v9.16b, v8.16b
+ bsl v16.16b, v30.16b, v29.16b
+ bsl v21.16b, v29.16b, v30.16b
+ eor v8.16b, v31.16b, v30.16b
+ and v1.16b, v1.16b, v31.16b
+ and v9.16b, v16.16b, v31.16b
+ and v6.16b, v6.16b, v30.16b
+ eor v16.16b, v17.16b, v21.16b
+ and v4.16b, v4.16b, v30.16b
+ eor v17.16b, v8.16b, v30.16b
+ and v21.16b, v24.16b, v8.16b
+ eor v9.16b, v9.16b, v25.16b
+ and v19.16b, v19.16b, v8.16b
+ eor v24.16b, v30.16b, v16.16b
+ eor v25.16b, v30.16b, v16.16b
+ and v7.16b, v7.16b, v17.16b
+ and v10.16b, v10.16b, v16.16b
+ eor v29.16b, v9.16b, v16.16b
+ eor v30.16b, v31.16b, v9.16b
+ and v0.16b, v24.16b, v0.16b
+ and v9.16b, v18.16b, v9.16b
+ and v2.16b, v25.16b, v2.16b
+ eor v10.16b, v10.16b, v6.16b
+ eor v18.16b, v29.16b, v16.16b
+ and v5.16b, v30.16b, v5.16b
+ eor v24.16b, v8.16b, v29.16b
+ and v25.16b, v26.16b, v29.16b
+ and v26.16b, v28.16b, v29.16b
+ eor v8.16b, v8.16b, v29.16b
+ eor v17.16b, v17.16b, v18.16b
+ eor v5.16b, v1.16b, v5.16b
+ and v23.16b, v24.16b, v23.16b
+ eor v21.16b, v21.16b, v25.16b
+ eor v19.16b, v19.16b, v26.16b
+ eor v0.16b, v4.16b, v0.16b
+ and v3.16b, v17.16b, v3.16b
+ eor v1.16b, v9.16b, v1.16b
+ eor v9.16b, v25.16b, v23.16b
+ eor v5.16b, v5.16b, v21.16b
+ eor v2.16b, v6.16b, v2.16b
+ and v6.16b, v8.16b, v22.16b
+ eor v3.16b, v7.16b, v3.16b
+ and v8.16b, v20.16b, v18.16b
+ eor v10.16b, v10.16b, v9.16b
+ eor v0.16b, v0.16b, v19.16b
+ eor v9.16b, v1.16b, v9.16b
+ eor v1.16b, v2.16b, v21.16b
+ eor v3.16b, v3.16b, v19.16b
+ and v16.16b, v27.16b, v16.16b
+ eor v17.16b, v26.16b, v6.16b
+ eor v6.16b, v8.16b, v7.16b
+ eor v7.16b, v1.16b, v9.16b
+ eor v1.16b, v5.16b, v3.16b
+ eor v2.16b, v10.16b, v3.16b
+ eor v4.16b, v16.16b, v4.16b
+ eor v8.16b, v6.16b, v17.16b
+ eor v5.16b, v9.16b, v3.16b
+ eor v9.16b, v0.16b, v1.16b
+ eor v6.16b, v7.16b, v1.16b
+ eor v0.16b, v4.16b, v17.16b
+ eor v4.16b, v8.16b, v7.16b
+ eor v7.16b, v9.16b, v2.16b
+ eor v8.16b, v3.16b, v0.16b
+ eor v7.16b, v7.16b, v5.16b
+ eor v3.16b, v4.16b, v7.16b
+ eor v4.16b, v7.16b, v0.16b
+ eor v7.16b, v8.16b, v3.16b
+ bcc .Ldec_done
+// Inverse MixColumns on the bit-sliced state, interleaved with loading
+// the byte permutation for the next round into v28.
+ ext v8.16b, v0.16b, v0.16b, #8
+ ext v9.16b, v1.16b, v1.16b, #8
+ ldr q28, [x11] // load from .LISR in common case (x10 > 0)
+ ext v10.16b, v6.16b, v6.16b, #8
+ ext v16.16b, v3.16b, v3.16b, #8
+ ext v17.16b, v5.16b, v5.16b, #8
+ ext v18.16b, v4.16b, v4.16b, #8
+ eor v8.16b, v8.16b, v0.16b
+ eor v9.16b, v9.16b, v1.16b
+ eor v10.16b, v10.16b, v6.16b
+ eor v16.16b, v16.16b, v3.16b
+ eor v17.16b, v17.16b, v5.16b
+ ext v19.16b, v2.16b, v2.16b, #8
+ ext v20.16b, v7.16b, v7.16b, #8
+ eor v18.16b, v18.16b, v4.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v8.16b, v2.16b, v10.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v19.16b, v2.16b
+ eor v9.16b, v20.16b, v7.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v8.16b, v8.16b, v16.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v4.16b, v4.16b, v16.16b
+ eor v2.16b, v3.16b, v2.16b
+ eor v1.16b, v1.16b, v17.16b
+ eor v3.16b, v5.16b, v9.16b
+ eor v5.16b, v8.16b, v17.16b
+ eor v7.16b, v7.16b, v17.16b
+ ext v8.16b, v0.16b, v0.16b, #12
+ ext v9.16b, v6.16b, v6.16b, #12
+ ext v10.16b, v4.16b, v4.16b, #12
+ ext v16.16b, v1.16b, v1.16b, #12
+ ext v17.16b, v5.16b, v5.16b, #12
+ ext v18.16b, v7.16b, v7.16b, #12
+ eor v0.16b, v0.16b, v8.16b
+ eor v6.16b, v6.16b, v9.16b
+ eor v4.16b, v4.16b, v10.16b
+ ext v19.16b, v2.16b, v2.16b, #12
+ ext v20.16b, v3.16b, v3.16b, #12
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v5.16b, v17.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v2.16b, v2.16b, v19.16b
+ eor v16.16b, v16.16b, v0.16b
+ eor v3.16b, v3.16b, v20.16b
+ eor v17.16b, v17.16b, v4.16b
+ eor v10.16b, v10.16b, v6.16b
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v9.16b, v9.16b, v1.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v8.16b, v8.16b, v3.16b
+ eor v16.16b, v16.16b, v3.16b
+ eor v18.16b, v18.16b, v5.16b
+ eor v19.16b, v19.16b, v7.16b
+ ext v21.16b, v5.16b, v5.16b, #8
+ ext v5.16b, v7.16b, v7.16b, #8
+ eor v7.16b, v20.16b, v2.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ ext v20.16b, v3.16b, v3.16b, #8
+ eor v17.16b, v17.16b, v3.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ eor v3.16b, v10.16b, v3.16b
+ ext v10.16b, v6.16b, v6.16b, #8
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v5.16b, v18.16b
+ eor v3.16b, v3.16b, v4.16b
+ eor v7.16b, v20.16b, v7.16b
+ eor v6.16b, v2.16b, v19.16b
+ eor v4.16b, v21.16b, v17.16b
+ eor v2.16b, v10.16b, v9.16b
+ bne .Ldec_loop
+ ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
+ b .Ldec_loop
+.align 4
+// Final round done: un-bit-slice the state (inverse of the input
+// transpose, same mask/shift structure) and XOR in the last round key
+// (q10, loaded from [x9]).
+.Ldec_done:
+ ushr v8.2d, v0.2d, #1
+ movi v9.16b, #0x55
+ ldr q10, [x9]
+ ushr v16.2d, v2.2d, #1
+ movi v17.16b, #0x33
+ ushr v18.2d, v6.2d, #1
+ movi v19.16b, #0x0f
+ eor v8.16b, v8.16b, v1.16b
+ ushr v20.2d, v3.2d, #1
+ eor v16.16b, v16.16b, v7.16b
+ eor v18.16b, v18.16b, v4.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v20.16b, v20.16b, v5.16b
+ and v16.16b, v16.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ shl v21.2d, v8.2d, #1
+ eor v1.16b, v1.16b, v8.16b
+ and v8.16b, v20.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ shl v9.2d, v16.2d, #1
+ eor v4.16b, v4.16b, v18.16b
+ shl v16.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v21.16b
+ shl v18.2d, v8.2d, #1
+ eor v5.16b, v5.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v6.16b, v6.16b, v16.16b
+ ushr v8.2d, v1.2d, #2
+ eor v3.16b, v3.16b, v18.16b
+ ushr v9.2d, v0.2d, #2
+ ushr v16.2d, v7.2d, #2
+ ushr v18.2d, v2.2d, #2
+ eor v8.16b, v8.16b, v4.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ and v17.16b, v18.16b, v17.16b
+ eor v4.16b, v4.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v5.16b, v5.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #2
+ eor v1.16b, v1.16b, v8.16b
+ eor v0.16b, v0.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ eor v2.16b, v2.16b, v17.16b
+ ushr v8.2d, v4.2d, #4
+ ushr v9.2d, v6.2d, #4
+ ushr v16.2d, v1.2d, #4
+ ushr v17.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v5.16b
+ eor v9.16b, v9.16b, v3.16b
+ eor v16.16b, v16.16b, v7.16b
+ eor v17.16b, v17.16b, v2.16b
+ and v8.16b, v8.16b, v19.16b
+ and v9.16b, v9.16b, v19.16b
+ and v16.16b, v16.16b, v19.16b
+ and v17.16b, v17.16b, v19.16b
+ eor v5.16b, v5.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v7.16b, v7.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v2.16b, v2.16b, v17.16b
+ shl v17.2d, v17.2d, #4
+ eor v4.16b, v4.16b, v8.16b
+ eor v6.16b, v6.16b, v9.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v3.16b, v3.16b, v10.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v10.16b
+ ret
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+// Permutation tables used by the encrypt/decrypt cores. Each entry is one
+// 16-byte TBL index vector. .align 6 places the object on a 64-byte
+// boundary (cache-line sized on common cores).
+_bsaes_const:
+// InvShiftRows constants
+// Used in _bsaes_decrypt8, which assumes contiguity
+// .LM0ISR used with round 0 key
+// .LISR used with middle round keys
+// .LISRM0 used with final round key
+.LM0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+
+// ShiftRows constants
+// Used in _bsaes_encrypt8, which assumes contiguity
+// .LM0SR used with round 0 key
+// .LSR used with middle round keys
+// .LSRM0 used with final round key
+.LM0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+
+// Bit-slice pre-permutations used by _bsaes_key_convert, which selects
+// between them at run time based on the __ARMEL__ build switch.
+.LM0_bigendian:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LM0_littleendian:
+.quad 0x0105090d0004080c, 0x03070b0f02060a0e
+
+// Used in bsaes_ctr32_encrypt_blocks, prior to dropping into
+// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
+.LREVM0SR:
+.quad 0x090d01050c000408, 0x03070b0f060a0e02
+
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+// Encrypt eight AES blocks in parallel using a bit-sliced representation.
+// Mirror image of _bsaes_decrypt8: ShiftRows tables instead of
+// InvShiftRows, forward S-box circuit, forward MixColumns.
+// On entry:
+// x9 -> key (previously expanded using _bsaes_key_convert)
+// x10 = number of rounds
+// v0-v7 input data
+// On exit:
+// x9-x11 corrupted
+// other general-purpose registers preserved
+// v0-v7 output data
+// v11-v15 preserved
+// other SIMD registers corrupted
+_bsaes_encrypt8:
+ ldr q8, [x9], #16
+ adr x11, .LM0SR
+ ldr q9, [x11], #16
+// Alternate entry point: caller supplies v8 (round-0 key) and v9 (initial
+// byte permutation, e.g. .LREVM0SR — see _bsaes_const) itself.
+// NOTE(review): entering here presumably also requires x11 -> .LSR, as
+// established by the loads above — confirm against callers (e.g.
+// bsaes_ctr32_encrypt_blocks).
+_bsaes_encrypt8_alt:
+// Round 0: XOR in the first round key and permute bytes per v9, then
+// bit-slice the state with the 0x55/0x33/0x0f mask-and-shift transpose.
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v8.16b
+ sub x10, x10, #1
+ eor v2.16b, v2.16b, v8.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v3.16b, v3.16b, v8.16b
+ eor v5.16b, v5.16b, v8.16b
+ tbl v0.16b, {v0.16b}, v9.16b
+ tbl v1.16b, {v1.16b}, v9.16b
+ tbl v2.16b, {v2.16b}, v9.16b
+ tbl v4.16b, {v4.16b}, v9.16b
+ eor v6.16b, v6.16b, v8.16b
+ eor v7.16b, v7.16b, v8.16b
+ tbl v3.16b, {v3.16b}, v9.16b
+ tbl v5.16b, {v5.16b}, v9.16b
+ tbl v6.16b, {v6.16b}, v9.16b
+ ushr v8.2d, v0.2d, #1
+ movi v10.16b, #0x55
+ tbl v7.16b, {v7.16b}, v9.16b
+ ushr v9.2d, v4.2d, #1
+ movi v16.16b, #0x33
+ ushr v17.2d, v2.2d, #1
+ eor v8.16b, v8.16b, v1.16b
+ movi v18.16b, #0x0f
+ ushr v19.2d, v6.2d, #1
+ eor v9.16b, v9.16b, v5.16b
+ eor v17.16b, v17.16b, v3.16b
+ and v8.16b, v8.16b, v10.16b
+ eor v19.16b, v19.16b, v7.16b
+ and v9.16b, v9.16b, v10.16b
+ and v17.16b, v17.16b, v10.16b
+ eor v1.16b, v1.16b, v8.16b
+ shl v8.2d, v8.2d, #1
+ and v10.16b, v19.16b, v10.16b
+ eor v5.16b, v5.16b, v9.16b
+ shl v9.2d, v9.2d, #1
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #1
+ eor v0.16b, v0.16b, v8.16b
+ shl v8.2d, v10.2d, #1
+ eor v7.16b, v7.16b, v10.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v2.16b, v2.16b, v17.16b
+ ushr v9.2d, v1.2d, #2
+ eor v6.16b, v6.16b, v8.16b
+ ushr v8.2d, v0.2d, #2
+ ushr v10.2d, v5.2d, #2
+ ushr v17.2d, v4.2d, #2
+ eor v9.16b, v9.16b, v3.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v10.16b, v10.16b, v7.16b
+ eor v17.16b, v17.16b, v6.16b
+ and v9.16b, v9.16b, v16.16b
+ and v8.16b, v8.16b, v16.16b
+ and v10.16b, v10.16b, v16.16b
+ and v16.16b, v17.16b, v16.16b
+ eor v3.16b, v3.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v2.16b, v2.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v7.16b, v7.16b, v10.16b
+ shl v10.2d, v10.2d, #2
+ eor v6.16b, v6.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v3.2d, #4
+ ushr v9.2d, v2.2d, #4
+ ushr v10.2d, v1.2d, #4
+ ushr v16.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v7.16b
+ eor v9.16b, v9.16b, v6.16b
+ eor v10.16b, v10.16b, v5.16b
+ eor v16.16b, v16.16b, v4.16b
+ and v8.16b, v8.16b, v18.16b
+ and v9.16b, v9.16b, v18.16b
+ and v10.16b, v10.16b, v18.16b
+ and v16.16b, v16.16b, v18.16b
+ eor v7.16b, v7.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v6.16b, v6.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v5.16b, v5.16b, v10.16b
+ shl v10.2d, v10.2d, #4
+ eor v4.16b, v4.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v8.16b
+ eor v2.16b, v2.16b, v9.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v16.16b
+ b .Lenc_sbox
+.align 4
+// Main round loop: XOR in the next round key, permute bytes with the
+// table currently held in v28 (.LSR for middle rounds, .LSRM0 for the
+// last round), then fall through into the forward S-box.
+.Lenc_loop:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
+ ldp q8, q9, [x9], #32
+ eor v0.16b, v16.16b, v0.16b
+ ldr q10, [x9], #16
+ eor v1.16b, v17.16b, v1.16b
+ ldr q16, [x9], #16
+ eor v2.16b, v18.16b, v2.16b
+ eor v3.16b, v19.16b, v3.16b
+ eor v4.16b, v8.16b, v4.16b
+ eor v5.16b, v9.16b, v5.16b
+ eor v6.16b, v10.16b, v6.16b
+ eor v7.16b, v16.16b, v7.16b
+ tbl v0.16b, {v0.16b}, v28.16b
+ tbl v1.16b, {v1.16b}, v28.16b
+ tbl v2.16b, {v2.16b}, v28.16b
+ tbl v3.16b, {v3.16b}, v28.16b
+ tbl v4.16b, {v4.16b}, v28.16b
+ tbl v5.16b, {v5.16b}, v28.16b
+ tbl v6.16b, {v6.16b}, v28.16b
+ tbl v7.16b, {v7.16b}, v28.16b
+// Forward AES S-box, evaluated as a boolean circuit over the bit-sliced
+// state (v0-v7 are bit-planes; v8-v31 are circuit temporaries). The
+// round counter is decremented here; flags feed the bcc/bne below.
+.Lenc_sbox:
+ eor v5.16b, v5.16b, v6.16b
+ eor v3.16b, v3.16b, v0.16b
+ subs x10, x10, #1
+ eor v2.16b, v2.16b, v1.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v8.16b, v3.16b, v7.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v5.16b
+ eor v8.16b, v8.16b, v4.16b
+ eor v3.16b, v6.16b, v3.16b
+ eor v4.16b, v4.16b, v5.16b
+ eor v6.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v7.16b
+ eor v1.16b, v8.16b, v1.16b
+ eor v8.16b, v7.16b, v4.16b
+ eor v9.16b, v3.16b, v0.16b
+ eor v10.16b, v7.16b, v6.16b
+ eor v16.16b, v5.16b, v3.16b
+ eor v17.16b, v6.16b, v2.16b
+ eor v18.16b, v5.16b, v1.16b
+ eor v19.16b, v2.16b, v4.16b
+ eor v20.16b, v1.16b, v0.16b
+ orr v21.16b, v8.16b, v9.16b
+ orr v22.16b, v10.16b, v16.16b
+ eor v23.16b, v8.16b, v17.16b
+ eor v24.16b, v9.16b, v18.16b
+ and v19.16b, v19.16b, v20.16b
+ orr v20.16b, v17.16b, v18.16b
+ and v8.16b, v8.16b, v9.16b
+ and v9.16b, v17.16b, v18.16b
+ and v17.16b, v23.16b, v24.16b
+ and v10.16b, v10.16b, v16.16b
+ eor v16.16b, v21.16b, v19.16b
+ eor v18.16b, v20.16b, v19.16b
+ and v19.16b, v2.16b, v1.16b
+ and v20.16b, v6.16b, v5.16b
+ eor v21.16b, v22.16b, v17.16b
+ eor v9.16b, v9.16b, v10.16b
+ eor v10.16b, v16.16b, v17.16b
+ eor v16.16b, v18.16b, v8.16b
+ and v17.16b, v4.16b, v0.16b
+ orr v18.16b, v7.16b, v3.16b
+ eor v21.16b, v21.16b, v8.16b
+ eor v8.16b, v9.16b, v8.16b
+ eor v9.16b, v10.16b, v19.16b
+ eor v10.16b, v3.16b, v0.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v17.16b, v5.16b, v1.16b
+ eor v19.16b, v21.16b, v20.16b
+ eor v20.16b, v8.16b, v18.16b
+ eor v8.16b, v8.16b, v18.16b
+ eor v18.16b, v7.16b, v4.16b
+ eor v21.16b, v9.16b, v16.16b
+ eor v22.16b, v6.16b, v2.16b
+ and v23.16b, v9.16b, v19.16b
+ eor v24.16b, v10.16b, v17.16b
+ eor v25.16b, v0.16b, v1.16b
+ eor v26.16b, v7.16b, v6.16b
+ eor v27.16b, v18.16b, v22.16b
+ eor v28.16b, v3.16b, v5.16b
+ eor v29.16b, v16.16b, v23.16b
+ eor v30.16b, v20.16b, v23.16b
+ eor v23.16b, v20.16b, v23.16b
+ eor v31.16b, v4.16b, v2.16b
+ bsl v29.16b, v19.16b, v20.16b
+ bsl v30.16b, v9.16b, v16.16b
+ bsl v8.16b, v29.16b, v23.16b
+ bsl v20.16b, v23.16b, v29.16b
+ eor v9.16b, v30.16b, v29.16b
+ and v5.16b, v5.16b, v30.16b
+ and v8.16b, v8.16b, v30.16b
+ and v1.16b, v1.16b, v29.16b
+ eor v16.16b, v19.16b, v20.16b
+ and v2.16b, v2.16b, v29.16b
+ eor v19.16b, v9.16b, v29.16b
+ and v17.16b, v17.16b, v9.16b
+ eor v8.16b, v8.16b, v21.16b
+ and v20.16b, v22.16b, v9.16b
+ eor v21.16b, v29.16b, v16.16b
+ eor v22.16b, v29.16b, v16.16b
+ and v23.16b, v25.16b, v16.16b
+ and v6.16b, v6.16b, v19.16b
+ eor v25.16b, v8.16b, v16.16b
+ eor v29.16b, v30.16b, v8.16b
+ and v4.16b, v21.16b, v4.16b
+ and v8.16b, v28.16b, v8.16b
+ and v0.16b, v22.16b, v0.16b
+ eor v21.16b, v23.16b, v1.16b
+ eor v22.16b, v9.16b, v25.16b
+ eor v9.16b, v9.16b, v25.16b
+ eor v23.16b, v25.16b, v16.16b
+ and v3.16b, v29.16b, v3.16b
+ and v24.16b, v24.16b, v25.16b
+ and v25.16b, v27.16b, v25.16b
+ and v10.16b, v22.16b, v10.16b
+ and v9.16b, v9.16b, v18.16b
+ eor v18.16b, v19.16b, v23.16b
+ and v19.16b, v26.16b, v23.16b
+ eor v3.16b, v5.16b, v3.16b
+ eor v17.16b, v17.16b, v24.16b
+ eor v10.16b, v24.16b, v10.16b
+ and v16.16b, v31.16b, v16.16b
+ eor v20.16b, v20.16b, v25.16b
+ eor v9.16b, v25.16b, v9.16b
+ eor v4.16b, v2.16b, v4.16b
+ and v7.16b, v18.16b, v7.16b
+ eor v18.16b, v19.16b, v6.16b
+ eor v5.16b, v8.16b, v5.16b
+ eor v0.16b, v1.16b, v0.16b
+ eor v1.16b, v21.16b, v10.16b
+ eor v8.16b, v3.16b, v17.16b
+ eor v2.16b, v16.16b, v2.16b
+ eor v3.16b, v6.16b, v7.16b
+ eor v6.16b, v18.16b, v9.16b
+ eor v4.16b, v4.16b, v20.16b
+ eor v10.16b, v5.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v9.16b, v2.16b, v9.16b
+ eor v3.16b, v3.16b, v20.16b
+ eor v7.16b, v6.16b, v1.16b
+ eor v5.16b, v8.16b, v4.16b
+ eor v6.16b, v10.16b, v1.16b
+ eor v2.16b, v4.16b, v0.16b
+ eor v4.16b, v3.16b, v10.16b
+ eor v9.16b, v9.16b, v7.16b
+ eor v3.16b, v0.16b, v5.16b
+ eor v0.16b, v1.16b, v4.16b
+ eor v1.16b, v4.16b, v8.16b
+ eor v4.16b, v9.16b, v5.16b
+ eor v6.16b, v6.16b, v3.16b
+ bcc .Lenc_done
+// Forward MixColumns on the bit-sliced state, interleaved with loading
+// the byte permutation for the next round into v28.
+ ext v8.16b, v0.16b, v0.16b, #12
+ ext v9.16b, v4.16b, v4.16b, #12
+ ldr q28, [x11] // load from .LSR in common case (x10 > 0)
+ ext v10.16b, v6.16b, v6.16b, #12
+ ext v16.16b, v1.16b, v1.16b, #12
+ ext v17.16b, v3.16b, v3.16b, #12
+ ext v18.16b, v7.16b, v7.16b, #12
+ eor v0.16b, v0.16b, v8.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ ext v19.16b, v2.16b, v2.16b, #12
+ ext v20.16b, v5.16b, v5.16b, #12
+ eor v1.16b, v1.16b, v16.16b
+ eor v3.16b, v3.16b, v17.16b
+ eor v7.16b, v7.16b, v18.16b
+ eor v2.16b, v2.16b, v19.16b
+ eor v16.16b, v16.16b, v0.16b
+ eor v5.16b, v5.16b, v20.16b
+ eor v17.16b, v17.16b, v6.16b
+ eor v10.16b, v10.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v9.16b, v9.16b, v1.16b
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v8.16b, v8.16b, v5.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v3.16b
+ eor v19.16b, v19.16b, v7.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v20.16b, v20.16b, v2.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v21.16b, v5.16b, v5.16b, #8
+ eor v17.16b, v17.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ eor v10.16b, v10.16b, v5.16b
+ ext v22.16b, v4.16b, v4.16b, #8
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v5.16b, v7.16b, v18.16b
+ eor v4.16b, v3.16b, v17.16b
+ eor v3.16b, v6.16b, v10.16b
+ eor v7.16b, v21.16b, v20.16b
+ eor v6.16b, v2.16b, v19.16b
+ eor v2.16b, v22.16b, v9.16b
+ bne .Lenc_loop
+ ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
+ b .Lenc_loop
+.align 4
+// Final round done: un-bit-slice the state (inverse of the input
+// transpose, same mask/shift structure) and XOR in the last round key
+// (q10, loaded from [x9]).
+.Lenc_done:
+ ushr v8.2d, v0.2d, #1
+ movi v9.16b, #0x55
+ ldr q10, [x9]
+ ushr v16.2d, v3.2d, #1
+ movi v17.16b, #0x33
+ ushr v18.2d, v4.2d, #1
+ movi v19.16b, #0x0f
+ eor v8.16b, v8.16b, v1.16b
+ ushr v20.2d, v2.2d, #1
+ eor v16.16b, v16.16b, v7.16b
+ eor v18.16b, v18.16b, v6.16b
+ and v8.16b, v8.16b, v9.16b
+ eor v20.16b, v20.16b, v5.16b
+ and v16.16b, v16.16b, v9.16b
+ and v18.16b, v18.16b, v9.16b
+ shl v21.2d, v8.2d, #1
+ eor v1.16b, v1.16b, v8.16b
+ and v8.16b, v20.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ shl v9.2d, v16.2d, #1
+ eor v6.16b, v6.16b, v18.16b
+ shl v16.2d, v18.2d, #1
+ eor v0.16b, v0.16b, v21.16b
+ shl v18.2d, v8.2d, #1
+ eor v5.16b, v5.16b, v8.16b
+ eor v3.16b, v3.16b, v9.16b
+ eor v4.16b, v4.16b, v16.16b
+ ushr v8.2d, v1.2d, #2
+ eor v2.16b, v2.16b, v18.16b
+ ushr v9.2d, v0.2d, #2
+ ushr v16.2d, v7.2d, #2
+ ushr v18.2d, v3.2d, #2
+ eor v8.16b, v8.16b, v6.16b
+ eor v9.16b, v9.16b, v4.16b
+ eor v16.16b, v16.16b, v5.16b
+ eor v18.16b, v18.16b, v2.16b
+ and v8.16b, v8.16b, v17.16b
+ and v9.16b, v9.16b, v17.16b
+ and v16.16b, v16.16b, v17.16b
+ and v17.16b, v18.16b, v17.16b
+ eor v6.16b, v6.16b, v8.16b
+ shl v8.2d, v8.2d, #2
+ eor v4.16b, v4.16b, v9.16b
+ shl v9.2d, v9.2d, #2
+ eor v5.16b, v5.16b, v16.16b
+ shl v16.2d, v16.2d, #2
+ eor v2.16b, v2.16b, v17.16b
+ shl v17.2d, v17.2d, #2
+ eor v1.16b, v1.16b, v8.16b
+ eor v0.16b, v0.16b, v9.16b
+ eor v7.16b, v7.16b, v16.16b
+ eor v3.16b, v3.16b, v17.16b
+ ushr v8.2d, v6.2d, #4
+ ushr v9.2d, v4.2d, #4
+ ushr v16.2d, v1.2d, #4
+ ushr v17.2d, v0.2d, #4
+ eor v8.16b, v8.16b, v5.16b
+ eor v9.16b, v9.16b, v2.16b
+ eor v16.16b, v16.16b, v7.16b
+ eor v17.16b, v17.16b, v3.16b
+ and v8.16b, v8.16b, v19.16b
+ and v9.16b, v9.16b, v19.16b
+ and v16.16b, v16.16b, v19.16b
+ and v17.16b, v17.16b, v19.16b
+ eor v5.16b, v5.16b, v8.16b
+ shl v8.2d, v8.2d, #4
+ eor v2.16b, v2.16b, v9.16b
+ shl v9.2d, v9.2d, #4
+ eor v7.16b, v7.16b, v16.16b
+ shl v16.2d, v16.2d, #4
+ eor v3.16b, v3.16b, v17.16b
+ shl v17.2d, v17.2d, #4
+ eor v6.16b, v6.16b, v8.16b
+ eor v4.16b, v4.16b, v9.16b
+ eor v7.16b, v7.16b, v10.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v3.16b, v3.16b, v10.16b
+ eor v0.16b, v0.16b, v17.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v5.16b, v5.16b, v10.16b
+ eor v1.16b, v1.16b, v10.16b
+ eor v0.16b, v0.16b, v10.16b
+ ret
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type _bsaes_key_convert,%function
+.align 4
+// On entry:
+// x9 -> input key (big-endian)
+// x10 = number of rounds
+// x17 -> output key (native endianness)
+// On exit:
+// x9, x10 corrupted
+// x11 -> .LM0_bigendian
+// x17 -> last quadword of output key
+// other general-purpose registers preserved
+// v2-v6 preserved
+// v7.16b[] = 0x63
+// v8-v14 preserved
+// v15 = last round key (converted to native endianness)
+// other SIMD registers corrupted
+_bsaes_key_convert:
+#ifdef __ARMEL__
+ adr x11, .LM0_littleendian
+#else
+ adr x11, .LM0_bigendian
+#endif
+