From 4e2b9907345c4d4c057cd7981f6d377c1412db93 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Tue, 13 Jul 2010 14:08:22 +0000
Subject: ARM assembler pack: reschedule instructions for dual-issue pipeline
 [from HEAD].

Modest improvement coefficients mean that the code already had some
parallelism and there was not much room for improvement. Special thanks
to Ted Krovetz for benchmarking the code with such patience.
---
 crypto/sha/asm/sha256-armv4.pl | 33 +++++++++++++++++++--------------
 crypto/sha/asm/sha512-armv4.pl | 32 ++++++++++++++++++--------------
 2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 48d846deec..492cb62bc0 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -11,9 +11,14 @@
 
 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte.
+# byte [on single-issue Xscale PXA250 core].
 
-$output=shift;
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";	$t0="r0";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
 ___
 $code.=<<___;
 	ldr	$t2,[$Ktbl],#4			@ *K256++
-	str	$T1,[sp,#`$i%16`*4]
 	mov	$t0,$e,ror#$Sigma1[0]
+	str	$T1,[sp,#`$i%16`*4]
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
-	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
-	add	$T1,$T1,$t0
 	eor	$t1,$f,$g
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g		@ Ch(e,f,g)
-	add	$T1,$T1,$t1
 	add	$T1,$T1,$h
-	add	$T1,$T1,$t2
 	mov	$h,$a,ror#$Sigma0[0]
+	add	$T1,$T1,$t1
 	eor	$h,$h,$a,ror#$Sigma0[1]
+	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
 	orr	$t0,$a,$b
-	and	$t0,$t0,$c
 	and	$t1,$a,$b
+	and	$t0,$t0,$c
+	add	$h,$h,$T1
 	orr	$t0,$t0,$t1			@ Maj(a,b,c)
-	add	$h,$h,$t0
 	add	$d,$d,$T1
-	add	$h,$h,$T1
+	add	$h,$h,$t0
 ___
 }
 
@@ -80,19 +85,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]	@ $i
+	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
 	mov	$t0,$t1,ror#$sigma0[0]
+	ldr	$inp,[sp,#`($i+9)%16`*4]
 	eor	$t0,$t0,$t1,ror#$sigma0[1]
 	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
 	mov	$t1,$t2,ror#$sigma1[0]
+	add	$T1,$T1,$t0
 	eor	$t1,$t1,$t2,ror#$sigma1[1]
+	add	$T1,$T1,$inp
 	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
-	add	$T1,$T1,$t0
 	add	$T1,$T1,$t1
-	add	$T1,$T1,$inp
 ___
 &BODY_00_15(@_);
 }
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 4fbb94a914..3a35861ac6 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
 # SHA512 block procedure for ARMv4. September 2007.
 
 # This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte.
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
 
 # Byte order [in]dependence. =========================================
 #
@@ -22,7 +28,7 @@ $hi=0;
 $lo=4;
 # ====================================================================
 
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";
@@ -73,33 +79,31 @@ $code.=<<___;
 	eor	$t0,$t0,$Elo,lsl#23
 	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
 	adds	$Tlo,$Tlo,$t0
-	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
-	adds	$Tlo,$Tlo,$t2
-	adc	$Thi,$Thi,$t3		@ T += h
-
 	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
 	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
 	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
 	ldr	$t3,[sp,#$Goff+4]	@ g.hi
-	str	$Elo,[sp,#$Eoff+0]
-	str	$Ehi,[sp,#$Eoff+4]
-	str	$Alo,[sp,#$Aoff+0]
-	str	$Ahi,[sp,#$Aoff+4]
 
 	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
 	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
 	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
 	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	eor	$t1,$t1,$t3		@ Ch(e,f,g)
-
 	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
 	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
-	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
-	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 
 	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3		@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
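
The idea behind the rescheduling, sketched below: Cortex A8 has an in-order
dual-issue pipeline, so two adjacent instructions can issue in the same cycle
only when the second does not consume the result of the first. The old rounds
ran the Sigma1 rotate-xor chain back-to-back; the patch interleaves independent
work (stores to the stack, loads of other working variables, the first steps of
Ch and Maj) with that chain. The fragment below is an illustrative sketch of
the transformation, not taken verbatim from either file: the registers and the
stack offset are arbitrary stand-ins for the script's $e, $T1 and temporary
variables, and the rotate counts (6, 11, 25) are the SHA-256 Sigma1 rotations
used by the script.

	@ before: every instruction waits on the previous result, so an
	@ in-order dual-issue core issues only one instruction per cycle
	str	r12,[sp,#0]		@ store X[i] (independent of the chain)
	mov	r3,r4,ror#6
	eor	r3,r3,r4,ror#11
	eor	r3,r3,r4,ror#25		@ r3 = Sigma1(e)
	add	r12,r12,r3		@ T1 += Sigma1(e)
	eor	r5,r6,r7		@ Ch(e,f,g) starts only here
	and	r5,r5,r4

	@ after: the independent store and the first steps of Ch(e,f,g)
	@ fill the second issue slot; the values computed are identical
	mov	r3,r4,ror#6
	str	r12,[sp,#0]		@ pairs with the mov above
	eor	r3,r3,r4,ror#11
	eor	r5,r6,r7		@ pairs with the eor above
	eor	r3,r3,r4,ror#25		@ r3 = Sigma1(e)
	and	r5,r5,r4
	add	r12,r12,r3		@ T1 += Sigma1(e)

The sha512-armv4.pl changes apply the same pattern to the 64-bit arithmetic,
pairing the adds/adc sequences with the f, g and d loads and the e and a
stores.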