author    Bernd Edlinger <bernd.edlinger@hotmail.de>  2022-06-12 09:37:26 +0200
committer Hugo Landau <hlandau@openssl.org>           2022-07-06 08:11:03 +0100
commit    a8f6d73fda64d514171e99a50d1483c0c0b8d968
tree      b808341a1007e6cde385d31e2151f51a5f5ecdf9
parent    60f011f584d80447e86cae1d1bd3ae24bc13235b
Fix reported performance degradation on aarch64
This restores the implementation prior to commit 2621751 ("aes/asm/aesv8-armx.pl:
avoid 32-bit lane assignment in CTR mode") for 64-bit targets only, since the new
code is reportedly 2-17% slower there and the silicon errata only affects 32-bit
targets. The new algorithm is still used for 32-bit targets.

Fixes #18445

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Hugo Landau <hlandau@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18539)
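The patch works by splitting the existing CTR code paths with perlasm's flavour test: the same heredoc-driven Perl script emits either the restored 64-bit sequence or the errata-safe 32-bit sequence depending on whether `$flavour` matches `/64/`. The following is a minimal sketch of that emission pattern only; the heredoc bodies are placeholders and the "linux64" default is an assumption, not the instructions or flavours of the real script.

	#!/usr/bin/env perl
	# Minimal sketch of the flavour-gated code emission used in
	# aesv8-armx.pl; heredoc bodies are placeholders, not real code.
	use strict;
	use warnings;

	my $flavour = shift // "linux64";   # assumed example flavour string
	my $code = "";

	$code.=<<___;                        # emitted for every target
		// instructions common to both flavours
	___

	$code.=<<___ if ($flavour =~ /64/);  # 64-bit targets only
		// pre-2621751 sequence restored here
	___

	$code.=<<___ if ($flavour !~ /64/);  # 32-bit targets only
		// errata-safe sequence kept here
	___

	print $code;

In the hunks below this is exactly the shape of the change: previously unconditional `$code.=<<___;` blocks are closed early and re-opened under `if ($flavour =~ /64/)` / `if ($flavour !~ /64/)` guards.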
-rwxr-xr-x  crypto/aes/asm/aesv8-armx.pl  62
1 file changed, 62 insertions(+), 0 deletions(-)
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
index 2b0e982996..1856d9977f 100755
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -740,6 +740,21 @@ $code.=<<___;
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
+___
+$code.=<<___ if ($flavour =~ /64/);
+ vorr $dat1,$dat0,$dat0
+ add $tctr1, $ctr, #1
+ vorr $dat2,$dat0,$dat0
+ add $ctr, $ctr, #2
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $tctr1
+ vmov.32 ${dat1}[3],$tctr1
+ b.ls .Lctr32_tail
+ rev $tctr2, $ctr
+ sub $len,$len,#3 // bias
+ vmov.32 ${dat2}[3],$tctr2
+___
+$code.=<<___ if ($flavour !~ /64/);
add $tctr1, $ctr, #1
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
@@ -751,6 +766,8 @@ $code.=<<___;
vmov.32 ${ivec}[3],$tctr2
sub $len,$len,#3 // bias
vorr $dat2,$ivec,$ivec
+___
+$code.=<<___;
b .Loop3x_ctr32
.align 4
@@ -777,11 +794,25 @@ $code.=<<___;
aese $dat1,q8
aesmc $tmp1,$dat1
vld1.8 {$in0},[$inp],#16
+___
+$code.=<<___ if ($flavour =~ /64/);
+ vorr $dat0,$ivec,$ivec
+___
+$code.=<<___ if ($flavour !~ /64/);
add $tctr0,$ctr,#1
+___
+$code.=<<___;
aese $dat2,q8
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
+___
+$code.=<<___ if ($flavour =~ /64/);
+ vorr $dat1,$ivec,$ivec
+___
+$code.=<<___ if ($flavour !~ /64/);
rev $tctr0,$tctr0
+___
+$code.=<<___;
aese $tmp0,q9
aesmc $tmp0,$tmp0
aese $tmp1,q9
@@ -790,6 +821,12 @@ $code.=<<___;
mov $key_,$key
aese $dat2,q9
aesmc $tmp2,$dat2
+___
+$code.=<<___ if ($flavour =~ /64/);
+ vorr $dat2,$ivec,$ivec
+ add $tctr0,$ctr,#1
+___
+$code.=<<___;
aese $tmp0,q12
aesmc $tmp0,$tmp0
aese $tmp1,q12
@@ -805,22 +842,47 @@ $code.=<<___;
aese $tmp1,q13
aesmc $tmp1,$tmp1
veor $in2,$in2,$rndlast
+___
+$code.=<<___ if ($flavour =~ /64/);
+ rev $tctr0,$tctr0
+ aese $tmp2,q13
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat0}[3], $tctr0
+___
+$code.=<<___ if ($flavour !~ /64/);
vmov.32 ${ivec}[3], $tctr0
aese $tmp2,q13
aesmc $tmp2,$tmp2
vorr $dat0,$ivec,$ivec
+___
+$code.=<<___;
rev $tctr1,$tctr1
aese $tmp0,q14
aesmc $tmp0,$tmp0
+___
+$code.=<<___ if ($flavour !~ /64/);
vmov.32 ${ivec}[3], $tctr1
rev $tctr2,$ctr
+___
+$code.=<<___;
aese $tmp1,q14
aesmc $tmp1,$tmp1
+___
+$code.=<<___ if ($flavour =~ /64/);
+ vmov.32 ${dat1}[3], $tctr1
+ rev $tctr2,$ctr
+ aese $tmp2,q14
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat2}[3], $tctr2
+___
+$code.=<<___ if ($flavour !~ /64/);
vorr $dat1,$ivec,$ivec
vmov.32 ${ivec}[3], $tctr2
aese $tmp2,q14
aesmc $tmp2,$tmp2
vorr $dat2,$ivec,$ivec
+___
+$code.=<<___;
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15