diff options
author | Bernd Edlinger <bernd.edlinger@hotmail.de> | 2022-06-12 09:37:26 +0200 |
---|---|---|
committer | Hugo Landau <hlandau@openssl.org> | 2022-07-06 08:11:03 +0100 |
commit | a8f6d73fda64d514171e99a50d1483c0c0b8d968 (patch) | |
tree | b808341a1007e6cde385d31e2151f51a5f5ecdf9 | |
parent | 60f011f584d80447e86cae1d1bd3ae24bc13235b (diff) |
Fix reported performance degradation on aarch64
This restores the implementation prior to
commit 2621751 ("aes/asm/aesv8-armx.pl: avoid 32-bit lane assignment in CTR mode")
for 64bit targets only, since it is reportedly 2-17% slower,
and the silicon errata only affects 32bit targets.
Only for 32bit targets the new algorithm is used.
Fixes #18445
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Hugo Landau <hlandau@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18539)
-rwxr-xr-x | crypto/aes/asm/aesv8-armx.pl | 62 |
1 files changed, 62 insertions, 0 deletions
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl index 2b0e982996..1856d9977f 100755 --- a/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/aes/asm/aesv8-armx.pl @@ -740,6 +740,21 @@ $code.=<<___; #ifndef __ARMEB__ rev $ctr, $ctr #endif +___ +$code.=<<___ if ($flavour =~ /64/); + vorr $dat1,$dat0,$dat0 + add $tctr1, $ctr, #1 + vorr $dat2,$dat0,$dat0 + add $ctr, $ctr, #2 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${dat1}[3],$tctr1 + b.ls .Lctr32_tail + rev $tctr2, $ctr + sub $len,$len,#3 // bias + vmov.32 ${dat2}[3],$tctr2 +___ +$code.=<<___ if ($flavour !~ /64/); add $tctr1, $ctr, #1 vorr $ivec,$dat0,$dat0 rev $tctr1, $tctr1 @@ -751,6 +766,8 @@ $code.=<<___; vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias vorr $dat2,$ivec,$ivec +___ +$code.=<<___; b .Loop3x_ctr32 .align 4 @@ -777,11 +794,25 @@ $code.=<<___; aese $dat1,q8 aesmc $tmp1,$dat1 vld1.8 {$in0},[$inp],#16 +___ +$code.=<<___ if ($flavour =~ /64/); + vorr $dat0,$ivec,$ivec +___ +$code.=<<___ if ($flavour !~ /64/); add $tctr0,$ctr,#1 +___ +$code.=<<___; aese $dat2,q8 aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 +___ +$code.=<<___ if ($flavour =~ /64/); + vorr $dat1,$ivec,$ivec +___ +$code.=<<___ if ($flavour !~ /64/); rev $tctr0,$tctr0 +___ +$code.=<<___; aese $tmp0,q9 aesmc $tmp0,$tmp0 aese $tmp1,q9 @@ -790,6 +821,12 @@ $code.=<<___; mov $key_,$key aese $dat2,q9 aesmc $tmp2,$dat2 +___ +$code.=<<___ if ($flavour =~ /64/); + vorr $dat2,$ivec,$ivec + add $tctr0,$ctr,#1 +___ +$code.=<<___; aese $tmp0,q12 aesmc $tmp0,$tmp0 aese $tmp1,q12 @@ -805,22 +842,47 @@ $code.=<<___; aese $tmp1,q13 aesmc $tmp1,$tmp1 veor $in2,$in2,$rndlast +___ +$code.=<<___ if ($flavour =~ /64/); + rev $tctr0,$tctr0 + aese $tmp2,q13 + aesmc $tmp2,$tmp2 + vmov.32 ${dat0}[3], $tctr0 +___ +$code.=<<___ if ($flavour !~ /64/); vmov.32 ${ivec}[3], $tctr0 aese $tmp2,q13 aesmc $tmp2,$tmp2 vorr $dat0,$ivec,$ivec +___ +$code.=<<___; rev $tctr1,$tctr1 aese $tmp0,q14 aesmc $tmp0,$tmp0 +___ +$code.=<<___ if ($flavour !~ /64/); vmov.32 ${ivec}[3], $tctr1 rev $tctr2,$ctr +___ +$code.=<<___; aese $tmp1,q14 aesmc $tmp1,$tmp1 +___ +$code.=<<___ if ($flavour =~ /64/); + vmov.32 ${dat1}[3], $tctr1 + rev $tctr2,$ctr + aese $tmp2,q14 + aesmc $tmp2,$tmp2 + vmov.32 ${dat2}[3], $tctr2 +___ +$code.=<<___ if ($flavour !~ /64/); vorr $dat1,$ivec,$ivec vmov.32 ${ivec}[3], $tctr2 aese $tmp2,q14 aesmc $tmp2,$tmp2 vorr $dat2,$ivec,$ivec +___ +$code.=<<___; subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 |