summaryrefslogtreecommitdiffstats
path: root/crypto/sha
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2010-07-13 14:08:22 +0000
committerAndy Polyakov <appro@openssl.org>2010-07-13 14:08:22 +0000
commit4e2b9907345c4d4c057cd7981f6d377c1412db93 (patch)
treef2430fdedcbe25dc88497b57237cc3387cec1898 /crypto/sha
parent6ca141858718c6ba0dfccb7efc9916561b9fcc15 (diff)
ARM assembler pack: reschedule instructions for dual-issue pipeline [from HEAD].
Modest improvement coefficients mean that code already had some parallelism and there was not very much room for improvement. Special thanks to Ted Krovetz for benchmarking the code with such patience.
Diffstat (limited to 'crypto/sha')
-rw-r--r--crypto/sha/asm/sha256-armv4.pl33
-rw-r--r--crypto/sha/asm/sha512-armv4.pl32
2 files changed, 37 insertions, 28 deletions
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 48d846deec..492cb62bc0 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -11,9 +11,14 @@
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte.
+# byte [on single-issue Xscale PXA250 core].
-$output=shift;
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
- str $T1,[sp,#`$i%16`*4]
mov $t0,$e,ror#$Sigma1[0]
+ str $T1,[sp,#`$i%16`*4]
eor $t0,$t0,$e,ror#$Sigma1[1]
- eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
- add $T1,$T1,$t0
eor $t1,$f,$g
+ eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
+ add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
- add $T1,$T1,$t1
add $T1,$T1,$h
- add $T1,$T1,$t2
mov $h,$a,ror#$Sigma0[0]
+ add $T1,$T1,$t1
eor $h,$h,$a,ror#$Sigma0[1]
+ add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
orr $t0,$a,$b
- and $t0,$t0,$c
and $t1,$a,$b
+ and $t0,$t0,$c
+ add $h,$h,$T1
orr $t0,$t0,$t1 @ Maj(a,b,c)
- add $h,$h,$t0
add $d,$d,$T1
- add $h,$h,$T1
+ add $h,$h,$t0
___
}
@@ -80,19 +85,19 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
- ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
ldr $T1,[sp,#`($i+0)%16`*4]
- ldr $inp,[sp,#`($i+9)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
+ ldr $inp,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t1,$t2,ror#$sigma1[0]
+ add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1]
+ add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
- add $T1,$T1,$t0
add $T1,$T1,$t1
- add $T1,$T1,$inp
___
&BODY_00_15(@_);
}
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 4fbb94a914..3a35861ac6 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -10,7 +10,13 @@
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte.
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
# Byte order [in]dependence. =========================================
#
@@ -22,7 +28,7 @@ $hi=0;
$lo=4;
# ====================================================================
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0";
@@ -73,33 +79,31 @@ $code.=<<___;
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
- adc $Thi,$Thi,$t1 @ T += Sigma1(e)
- adds $Tlo,$Tlo,$t2
- adc $Thi,$Thi,$t3 @ T += h
-
ldr $t0,[sp,#$Foff+0] @ f.lo
+ adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
+ adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
+ adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
- str $Elo,[sp,#$Eoff+0]
- str $Ehi,[sp,#$Eoff+4]
- str $Alo,[sp,#$Aoff+0]
- str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
+ str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
+ str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
+ str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
+ str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
- eor $t1,$t1,$t3 @ Ch(e,f,g)
-
ldr $t2,[$Ktbl,#4] @ K[i].lo
+ eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi
- ldr $Elo,[sp,#$Doff+0] @ d.lo
- ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t0
+ ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
+ ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo