ARM assembler pack: reschedule instructions for dual-issue pipeline [from HEAD].

The modest improvement coefficients indicate that the code already had some parallelism, so there was not much room for improvement. Special thanks to Ted Krovetz for benchmarking the code with such patience.
author: Andy Polyakov <appro@openssl.org> 2010-07-13 14:08:22 +0000
committer: Andy Polyakov <appro@openssl.org> 2010-07-13 14:08:22 +0000
commit: 4e2b9907345c4d4c057cd7981f6d377c1412db93 (patch)
tree: f2430fdedcbe25dc88497b57237cc3387cec1898 /crypto/sha
parent: 6ca141858718c6ba0dfccb7efc9916561b9fcc15 (diff)
2 files changed, 37 insertions, 28 deletions
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 48d846deec..492cb62bc0 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -11,9 +11,14 @@
 
 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte.
+# byte [on single-issue Xscale PXA250 core].
 
-$output=shift;
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";	$t0="r0";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
 ___
 $code.=<<___;
 	ldr	$t2,[$Ktbl],#4			@ *K256++
-	str	$T1,[sp,#`$i%16`*4]
 	mov	$t0,$e,ror#$Sigma1[0]
+	str	$T1,[sp,#`$i%16`*4]
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
-	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
-	add	$T1,$T1,$t0
 	eor	$t1,$f,$g
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g			@ Ch(e,f,g)
-	add	$T1,$T1,$t1
 	add	$T1,$T1,$h
-	add	$T1,$T1,$t2
 	mov	$h,$a,ror#$Sigma0[0]
+	add	$T1,$T1,$t1
 	eor	$h,$h,$a,ror#$Sigma0[1]
+	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
 	orr	$t0,$a,$b
-	and	$t0,$t0,$c
 	and	$t1,$a,$b
+	and	$t0,$t0,$c
+	add	$h,$h,$T1
 	orr	$t0,$t0,$t1			@ Maj(a,b,c)
-	add	$h,$h,$t0
 	add	$d,$d,$T1
-	add	$h,$h,$T1
+	add	$h,$h,$t0
 ___
 }
 
@@ -80,19 +85,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]	@ $i
+	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
 	mov	$t0,$t1,ror#$sigma0[0]
+	ldr	$inp,[sp,#`($i+9)%16`*4]
 	eor	$t0,$t0,$t1,ror#$sigma0[1]
 	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
 	mov	$t1,$t2,ror#$sigma1[0]
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$t2,ror#$sigma1[1]
+	add	$T1,$T1,$inp
 	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
-	add	$T1,$T1,$t0
 	add	$T1,$T1,$t1
-	add	$T1,$T1,$inp
 ___
 	&BODY_00_15(@_);
 }
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 4fbb94a914..3a35861ac6 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
 # SHA512 block procedure for ARMv4. September 2007.
 
 # This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte. 
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
 
 # Byte order [in]dependence. =========================================
 #
@@ -22,7 +28,7 @@ $hi=0;
 $lo=4;
 # ====================================================================
 
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";
@@ -73,33 +79,31 @@ $code.=<<___;
 	eor	$t0,$t0,$Elo,lsl#23
 	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
 	adds	$Tlo,$Tlo,$t0
-	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
-	adds	$Tlo,$Tlo,$t2
-	adc	$Thi,$Thi,$t3		@ T += h
-
 	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
 	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
 	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
 	ldr	$t3,[sp,#$Goff+4]	@ g.hi
-	str	$Elo,[sp,#$Eoff+0]
-	str	$Ehi,[sp,#$Eoff+4]
-	str	$Alo,[sp,#$Aoff+0]
-	str	$Ahi,[sp,#$Aoff+4]
 
 	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
 	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
 	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
 	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	eor	$t1,$t1,$t3		@ Ch(e,f,g)
-
 	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
 	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
-	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
-	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 
 	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3		@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
author	Andy Polyakov <appro@openssl.org>	2010-07-13 14:08:22 +0000
committer	Andy Polyakov <appro@openssl.org>	2010-07-13 14:08:22 +0000
commit	4e2b9907345c4d4c057cd7981f6d377c1412db93 (patch)
tree	f2430fdedcbe25dc88497b57237cc3387cec1898 /crypto/sha
parent	6ca141858718c6ba0dfccb7efc9916561b9fcc15 (diff)