Optimize AES-GCM for uarchs with unroll and new instructions

Increase the block numbers to 8 for every iteration. Increase the hash table capacity. Make use of EOR3 instruction to improve the performance. This can improve performance 25-40% on out-of-order microarchitectures with a large number of fast execution units, such as Neoverse V1. We also see 20-30% performance improvements on other architectures such as the M1. Assembly code reviewed by Tom Cosgrove (ARM). Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/15916) (cherry picked from commit 954f45ba4c504570206ff5bed811e512cf92dc8e)
author: XiaokangQian <xiaokang.qian@arm.com> 2021-06-09 06:35:46 +0000
committer: Tomas Mraz <tomas@openssl.org> 2022-11-11 10:02:44 +0100
commit: 34ca334e5de6837f2c6bc0b0b0df28bdd237e4d7 (patch)
tree: 7e1cc3c3cc26f34e53ac4c4a1f957bd5e892fb65 /crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
parent: a2bdca6fe666c3a0a13e7f0a51626715608f8597 (diff)
1 files changed, 7369 insertions, 0 deletions
diff --git a/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl b/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
new file mode 100644
index 0000000000..1aaad663d7
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
@@ -0,0 +1,7369 @@
+#! /usr/bin/env perl
+# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+#========================================================================
+# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
+# derived from https://github.com/ARM-software/AArch64cryptolib, original
+# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
+# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
+# obtain it.
+#========================================================================
+#
+# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
+# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
+# intermediate hashes from the 8 blocks.
+#
+#  ____________________________________________________
+# |                                                    |
+# | PRE                                                |
+# |____________________________________________________|
+# |                |                |                  |
+# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
+# |________________|____(mostly)____|__________________|
+# |                                                    |
+# | MODULO                                             |
+# |____________________________________________________|
+#
+# PRE:
+#     Ensure previous generated intermediate hash is aligned and merged with result for GHASH 8k+0
+# EXT low_acc, low_acc, low_acc, #8
+# EOR res_curr (8k+0), res_curr (8k+0), low_acc
+#
+# CTR block:
+#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
+# REV     ctr32, rev_ctr32
+# ORR     ctr64, constctr96_top32, ctr32, LSL #32
+# INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
+# INS     ctr_next.d[1], ctr64X
+# ADD     rev_ctr32, #1
+#
+# AES block:
+#      Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
+#      Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
+#      Given we are very constrained in our ASIMD registers this is quite important
+#
+#      Encrypt:
+# LDR     input_low, [ input_ptr  ], #8
+# LDR     input_high, [ input_ptr  ], #8
+# EOR     input_low, k14_low
+# EOR     input_high, k14_high
+# INS     res_curr.d[0], input_low
+# INS     res_curr.d[1], input_high
+# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k13
+# EOR     res_curr, res_curr, ctr_curr
+# ST1     { res_curr.16b  }, [ output_ptr  ], #16
+#
+#     Decrypt:
+# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k13
+# LDR     res_curr, [ input_ptr  ], #16
+# EOR     res_curr, res_curr, ctr_curr
+# MOV     output_low, res_curr.d[0]
+# MOV     output_high, res_curr.d[1]
+# EOR     output_low, k14_low
+# EOR     output_high, k14_high
+# STP     output_low, output_high, [ output_ptr  ], #16
+
+# GHASH block X:
+#     Do 128b karatsuba polynomial multiplication on block
+#     We only have 64b->128b polynomial multipliers; naively that means we need to do 4 64b multiplies to generate a
+#     128b multiplication:
+#
+#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
+#
+#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
+#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
+#
+#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
+#     multiplying with "twisted" powers of H
+#
+# Note: We can PMULL directly into the acc_x in first GHASH of the loop
+# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
+#       path latency dominates the performance
+#
+#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
+#       than indicated here
+# REV64   res_curr, res_curr
+# INS     t_m.d[0], res_curr.d[1]
+# EOR     t_m.8B, t_m.8B, res_curr.8B
+# PMULL2  t_h, res_curr, HX
+# PMULL   t_l, res_curr, HX
+# PMULL   t_m, t_m, HX_k
+# EOR     acc_h, acc_h, t_h
+# EOR     acc_l, acc_l, t_l
+# EOR     acc_m, acc_m, t_m
+#
+# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
+#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
+#         with a reversed constant
+# EOR3    acc_m, acc_m, acc_l, acc_h                     // Finish off karatsuba processing
+# PMULL   t_mod, acc_h, mod_constant
+# EXT     acc_h, acc_h, acc_h, #8
+# EOR3     acc_m, acc_m, t_mod, acc_h
+# PMULL   acc_h, acc_m, mod_constant
+# EXT     acc_m, acc_m, acc_m, #8
+# EOR3    acc_l, acc_l, acc_m, acc_h
+
+# Command-line handling: the last argument is taken as the output file when it
+# ends in a ".ext" suffix, and the first argument is taken as the assembler
+# flavour when it contains no dot.
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Locate arm-xlate.pl, first next to this script and then in ../../perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
+die "can't locate arm-xlate.pl";
+
+die "only for 64 bit" if $flavour !~ /64/;
+
+# Everything printed to STDOUT is piped through arm-xlate.pl.  Fail loudly if
+# the pipe cannot be created instead of silently generating nothing.
+open OUT,"| \"$^X\" $xlate $flavour $output"
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Start of the generated assembly: include arm_arch.h and open an
+# __ARM_MAX_ARCH__>=8 preprocessor guard.
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+___
+# Assembler directives.  The sha3 architecture extension is enabled because the
+# kernel code below uses the eor3 instruction.
+$code.=".arch   armv8.2-a+crypto\n.arch_extension sha3\n.text\n";
+
+# Scalar register names shared by the kernel code below.
+$input_ptr="x0";  #argument block
+# Length in bits; the kernel shifts it right by 3 to obtain the byte length.
+$bit_length="x1";
+$output_ptr="x2";
+# Base pointer for the current tag load and for the h1..h8 / hNNk table loads.
+$current_tag="x3";
+# Receives incoming x4 in the prologue; CTR block 0 is loaded from it.
+$counter="x16";
+# Scratch register holding the 0x100000000 counter increment.
+$constant_temp="x15";
+# Points at the stack slot holding the 0xc200000000000000 modulo constant.
+$modulo_constant="x10";
+# Receives incoming x5 in the prologue; base pointer for the round-key loads.
+$cc="x8";
+{
+# Scalar temporaries.  x4/x5 overlap the incoming argument registers; the
+# prologue copies those arguments to $counter/$cc before x4/x5 are reused.
+my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
+my ($temp2_x,$temp3_x)=map("x$_",(13..14));
+# v0-v7 hold the eight counter/keystream blocks, v8-v15 the eight result
+# blocks; the b/d/q suffixed names are views of the same registers.
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
+my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
+
+# The ctr_t* names alias the result registers v8-v15 (used for loaded input).
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
+my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
+
+# GHASH accumulators: high, middle and low partial products.
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
+
+# Hash-key powers.  h1..h4 and h5..h8 (and their "k" companions) share
+# v20-v25, so only one group is live in registers at a time.
+my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
+my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
+my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
+my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
+
+# GHASH temporaries.  Most of them alias result registers or each other.
+my $t0="v16";
+my $t0d="d16";
+
+my $t1="v29";
+my $t2=$res1;
+my $t3=$t1;
+
+my $t4=$res0;
+my $t5=$res2;
+my $t6=$t0;
+
+my $t7=$res3;
+my $t8=$res4;
+my $t9=$res5;
+
+# Note: $t11 is v21, the same register as $h12k/$h56k.
+my $t10=$res6;
+my $t11="v21";
+my $t12=$t1;
+
+# Counter state: $rtmp_ctr is the byte-reversed running counter and
+# $rctr_inc the per-block increment added to it.
+my $rtmp_ctr="v30";
+my $rtmp_ctrq="q30";
+my $rctr_inc="v31";
+my $rctr_incd="d31";
+
+# The modulo constant is loaded into $t0 (v16).
+my $mod_constantd=$t0d;
+my $mod_constant=$t0;
+
+# Round keys rotate through v26-v28: every group of three names below maps
+# onto the same three registers, so keys are reloaded as the rounds progress.
+my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
+my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
+my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
+my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
+my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
+my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
+my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
+my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
+my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
+my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
+my $rk2q1="v28.1q";
+my $rk3q1="v26.1q";
+my $rk4v="v27";
+
+
+#########################################################################################
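+# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,	/* x0 */
+#                               size_t len,				/* x1, length in bits */
+#                               unsigned char *out,			/* x2 */
+#                               u64 *Xi,				/* x3, current tag; hash key table at offset 32 onwards */
+#                               unsigned char ivec[16],			/* x4, counter block */
+#                               const void *key);			/* x5, AES round keys */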
+#
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_enc_128_kernel
+.type   unroll8_eor3_aes_gcm_enc_128_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_enc_128_kernel:
+	AARCH64_VALID_CALL_TARGET
+	cbz	x1, .L128_enc_ret
+	stp	d8, d9, [sp, #-80]!
+	mov	$counter, x4
+	mov	$cc, x5
+	stp	d10, d11, [sp, #16]
+	stp	d12, d13, [sp, #32]
+	stp	d14, d15, [sp, #48]
+	mov	x5, #0xc200000000000000
+	stp	x5, xzr, [sp, #64]
+	add	$modulo_constant, sp, #64
+
+	mov	$constant_temp, #0x100000000				@ set up counter increment
+	movi	$rctr_inc.16b, #0x0
+	mov	$rctr_inc.d[1], $constant_temp
+	lsr	$main_end_input_ptr, $bit_length, #3		  	@ byte_len
+	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
+
+	sub	$main_end_input_ptr, $main_end_input_ptr, #1	 	@ byte_len - 1
+
+	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80		@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
+
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
+
+	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
+
+	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
+
+	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
+
+	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
+
+	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
+	ldp	$rk0q, $rk1q, [$cc, #0]				  	@ load rk0, rk1
+
+	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
+
+	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
+
+	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
+	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
+	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
+
+	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
+	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
+	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
+
+	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
+	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
+	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
+
+	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
+
+	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
+	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
+	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
+
+	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
+	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
+	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
+
+	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
+	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
+	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
+
+	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
+	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
+	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
+
+	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
+	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
+	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
+
+	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
+
+	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
+	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
+	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
+
+	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
+	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
+	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
+
+	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
+
+	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
+	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
+	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
+
+	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
+	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
+	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
+
+	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
+	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
+	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
+
+	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
+	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
+	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
+
+	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
+	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
+	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
+
+	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
+	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
+	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
+
+	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
+	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
+	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
+
+	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
+	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
+	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
+
+	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
+	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
+	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
+
+	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
+
+	ld1	{ $acc_lb}, [$current_tag]
+	ext	$acc_lb, $acc_lb, $acc_lb, #8
+	rev64	$acc_lb, $acc_lb
+
+	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
+
+	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
+	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
+	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
+
+	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
+	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
+	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
+
+	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
+	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
+	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
+
+	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
+	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
+	ldr	$rk10q, [$cc, #160]					@ load rk10
+
+	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
+	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
+	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
+
+	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
+	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
+	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
+
+	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
+	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
+	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
+
+	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
+	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
+	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
+
+	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
+	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
+	b.ge	.L128_enc_tail						@ handle tail
+
+	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
+
+	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
+
+	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
+
+	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
+	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
+
+	eor3	$res0b, $ctr_t0b, $ctr0b, $rk10				@ AES block 0 - result
+	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
+
+	eor3	$res1b, $ctr_t1b, $ctr1b, $rk10				@ AES block 1 - result
+	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
+
+	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
+	eor3	$res5b, $ctr_t5b, $ctr5b, $rk10				@ AES block 5 - result
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
+
+	eor3	$res2b, $ctr_t2b, $ctr2b, $rk10				@ AES block 2 - result
+	eor3	$res6b, $ctr_t6b, $ctr6b, $rk10				@ AES block 6 - result
+	eor3	$res4b, $ctr_t4b, $ctr4b, $rk10				@ AES block 4 - result
+
+	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
+
+	eor3	$res3b, $ctr_t3b, $ctr3b, $rk10				@ AES block 3 - result
+	eor3	$res7b, $ctr_t7b, $ctr7b,$rk10				@ AES block 7 - result
+	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
+
+	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
+	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
+
+	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
+
+	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
+	b.ge	.L128_enc_prepretail					@ do prepretail
+
+.L128_enc_main_loop:							@ main loop start
+	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
+	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
+	ext     $h5.16b, $h5.16b, $h5.16b, #8
+	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
+	ext     $h6.16b, $h6.16b, $h6.16b, #8
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
+
+	rev64	$res1b, $res1b						@ GHASH block 8k+1
+	rev64	$res0b, $res0b						@ GHASH block 8k
+	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
+	ext     $h7.16b, $h7.16b, $h7.16b, #8
+	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
+	ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
+	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
+
+	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
+	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
+	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
+	rev64	$res3b, $res3b						@ GHASH block 8k+3
+
+	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
+	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
+	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
+
+	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
+	rev64	$res2b, $res2b						@ GHASH block 8k+2
+	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
+
+	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
+	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
+	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
+
+	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
+	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
+	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
+
+	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
+	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
+	ext     $h3.16b, $h3.16b, $h3.16b, #8
+	ldr	$h4q, [$current_tag, #112]				@ load h3l | h3h
+	ext     $h4.16b, $h4.16b, $h4.16b, #8
+	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
+
+	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
+	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
+	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
+
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
+	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
+	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
+
+	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
+	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
+	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
+
+	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
+	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
+	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
+
+	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
+	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
+	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
+
+	eor3	$acc_hb, $acc_hb, $t1.16b,$t2.16b			@ GHASH block 8k+2, 8k+3 - high
+	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
+	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
+
+	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
+	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
+	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
+
+	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
+	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
+	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
+
+	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
+	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
+	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
+
+	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
+	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
+
+	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
+	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
+	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
+
+	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
+	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
+	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
+
+	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
+	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
+	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
+
+	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
+	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
+	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
+
+	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
+	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
+	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
+	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
+
+	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
+	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
+	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
+
+	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
+	ext     $h1.16b, $h1.16b, $h1.16b, #8
+	ldr	$h2q, [$current_tag, #64]				@ load h1l | h1h
+	ext     $h2.16b, $h2.16b, $h2.16b, #8
+	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
+	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
+
+	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
+	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
+
+	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
+	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
+
+	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
+	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
+
+	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
+	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
+
+	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
+	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
+	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
+
+	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
+	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
+	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
+
+	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
+	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
+	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
+
+	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
+	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
+	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
+
+	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
+	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
+	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
+
+	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
+	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
+	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
+
+	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low (product overwrites h2)
+	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
+	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
+
+	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
+	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
+
+	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
+	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
+	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
+
+	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
+	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
+	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
+
+	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
+	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
+	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low (product overwrites h1)
+
+	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
+	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
+
+	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
+	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
+	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
+
+	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid (product overwrites h12k)
+	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
+	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
+
+	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
+	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16 (staged in h1, copied to ctr0 once block 8k+8 is finished)
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
+
+	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
+	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
+	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
+
+	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
+	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
+	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
+
+	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
+	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
+	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
+
+	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
+	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
+	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
+
+	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
+	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
+	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
+
+	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17 (staged in h2, copied to ctr1 once block 8k+9 is finished)
+	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
+
+	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
+	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load plaintext
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
+
+	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
+	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
+	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
+
+	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
+	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
+	ldr	$rk10q, [$cc, #160]					@ load rk10
+
+	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
+	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18 (staged in h3, copied to ctr2 once block 8k+10 is finished)
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
+	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
+
+	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
+	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
+	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
+	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9 (last aese: no aesmc, rk10 is XORed in with the result)
+	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
+	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
+
+	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load plaintext
+	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19 (staged in h4, copied to ctr3 below)
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
+
+	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
+	eor3	$res4b, $ctr_t4b, $ctr4b, $rk10				@ AES block 8k+12 - result
+	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
+
+	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
+	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
+
+	eor3	$res2b, $ctr_t2b, $ctr2b, $rk10				@ AES block 8k+10 - result
+
+	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
+	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
+
+	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
+
+	eor3	$res7b, $ctr_t7b, $ctr7b, $rk10				@ AES block 8k+15 - result
+	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
+	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
+
+	eor3	$res1b, $ctr_t1b, $ctr1b, $rk10				@ AES block 8k+9 - result
+	eor3	$res3b, $ctr_t3b, $ctr3b, $rk10				@ AES block 8k+11 - result
+	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
+
+	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
+	eor3	$res5b, $ctr_t5b, $ctr5b, $rk10				@ AES block 8k+13 - result
+	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
+
+	eor3	$res0b, $ctr_t0b, $ctr0b, $rk10				@ AES block 8k+8 - result
+	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
+	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
+
+	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
+	eor3	$res6b, $ctr_t6b, $ctr6b, $rk10				@ AES block 8k+14 - result
+
+	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
+	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
+
+	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
+	b.lt	.L128_enc_main_loop					@ LOOP CONTROL - repeat while input_ptr < main_end_input_ptr (flags from the cmp above)
+
+.L128_enc_prepretail:							@ PREPRETAIL
+	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
+	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
+	ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
+	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
+	ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8
+	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
+
+	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
+	ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
+	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
+	ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
+	rev64	$res0b, $res0b						@ GHASH block 8k
+	rev64	$res1b, $res1b						@ GHASH block 8k+1
+
+	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
+	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
+	rev64	$res3b, $res3b						@ GHASH block 8k+3
+
+	rev64	$res2b, $res2b						@ GHASH block 8k+2
+	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
+
+	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
+
+	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
+	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
+	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
+
+	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
+	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
+
+	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
+	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
+	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
+
+	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
+	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
+
+	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
+	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
+
+	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k - mid
+	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
+
+	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
+	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
author	XiaokangQian <xiaokang.qian@arm.com>	2021-06-09 06:35:46 +0000
committer	Tomas Mraz <tomas@openssl.org>	2022-11-11 10:02:44 +0100
commit	34ca334e5de6837f2c6bc0b0b0df28bdd237e4d7 (patch)
tree	7e1cc3c3cc26f34e53ac4c4a1f957bd5e892fb65 /crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
parent	a2bdca6fe666c3a0a13e7f0a51626715608f8597 (diff)