ARM assembler pack: reschedule instructions for dual-issue pipeline.

The modest improvement coefficients indicate that the code already had some parallelism, so there was not much room for improvement. Special thanks to Ted Krovetz for benchmarking the code with such patience.
author: Andy Polyakov <appro@openssl.org> 2010-07-13 14:03:31 +0000
committer: Andy Polyakov <appro@openssl.org> 2010-07-13 14:03:31 +0000
commit: 2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree: 5971d660922f8ef8c58bc18ab4fc6f1a22346ed5 /crypto/sha/asm
parent: 0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)
2 files changed, 35 insertions, 26 deletions
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index db87434f91..492cb62bc0 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -11,7 +11,12 @@
 
 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte.
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
 ___
 $code.=<<___;
 	ldr	$t2,[$Ktbl],#4			@ *K256++
-	str	$T1,[sp,#`$i%16`*4]
 	mov	$t0,$e,ror#$Sigma1[0]
+	str	$T1,[sp,#`$i%16`*4]
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
-	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
-	add	$T1,$T1,$t0
 	eor	$t1,$f,$g
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g			@ Ch(e,f,g)
-	add	$T1,$T1,$t1
 	add	$T1,$T1,$h
-	add	$T1,$T1,$t2
 	mov	$h,$a,ror#$Sigma0[0]
+	add	$T1,$T1,$t1
 	eor	$h,$h,$a,ror#$Sigma0[1]
+	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
 	orr	$t0,$a,$b
-	and	$t0,$t0,$c
 	and	$t1,$a,$b
+	and	$t0,$t0,$c
+	add	$h,$h,$T1
 	orr	$t0,$t0,$t1			@ Maj(a,b,c)
-	add	$h,$h,$t0
 	add	$d,$d,$T1
-	add	$h,$h,$T1
+	add	$h,$h,$t0
 ___
 }
 
@@ -80,19 +85,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]	@ $i
+	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
 	mov	$t0,$t1,ror#$sigma0[0]
+	ldr	$inp,[sp,#`($i+9)%16`*4]
 	eor	$t0,$t0,$t1,ror#$sigma0[1]
 	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
 	mov	$t1,$t2,ror#$sigma1[0]
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$t2,ror#$sigma1[1]
+	add	$T1,$T1,$inp
 	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
-	add	$T1,$T1,$t0
 	add	$T1,$T1,$t1
-	add	$T1,$T1,$inp
 ___
 	&BODY_00_15(@_);
 }
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7d27f0b78d..3a35861ac6 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
 # SHA512 block procedure for ARMv4. September 2007.
 
 # This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte. 
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
 
 # Byte order [in]dependence. =========================================
 #
@@ -73,33 +79,31 @@ $code.=<<___;
 	eor	$t0,$t0,$Elo,lsl#23
 	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
 	adds	$Tlo,$Tlo,$t0
-	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
-	adds	$Tlo,$Tlo,$t2
-	adc	$Thi,$Thi,$t3		@ T += h
-
 	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
 	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
 	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
 	ldr	$t3,[sp,#$Goff+4]	@ g.hi
-	str	$Elo,[sp,#$Eoff+0]
-	str	$Ehi,[sp,#$Eoff+4]
-	str	$Alo,[sp,#$Aoff+0]
-	str	$Ahi,[sp,#$Aoff+4]
 
 	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
 	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
 	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
 	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	eor	$t1,$t1,$t3		@ Ch(e,f,g)
-
 	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
 	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
-	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
-	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 
 	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3		@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
author	Andy Polyakov <appro@openssl.org>	2010-07-13 14:03:31 +0000
committer	Andy Polyakov <appro@openssl.org>	2010-07-13 14:03:31 +0000
commit	2d22e08083d031eb63cada59aa95afe5279008d3 (patch)
tree	5971d660922f8ef8c58bc18ab4fc6f1a22346ed5 /crypto/sha/asm
parent	0852f90c300405c79c2af5c549e74d0d4a8f664c (diff)