diff options
Diffstat (limited to 'crypto/sha')
-rw-r--r-- | crypto/sha/asm/sha256-armv4.pl | 33 | ||||
-rw-r--r-- | crypto/sha/asm/sha512-armv4.pl | 32 |
2 files changed, 37 insertions, 28 deletions
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index 48d846deec..492cb62bc0 100644 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -11,9 +11,14 @@ # Performance is ~2x better than gcc 3.4 generated code and in "abso- # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per -# byte. +# byte [on single-issue Xscale PXA250 core]. -$output=shift; +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; $t0="r0"; @@ -52,27 +57,27 @@ $code.=<<___ if ($i<16); ___ $code.=<<___; ldr $t2,[$Ktbl],#4 @ *K256++ - str $T1,[sp,#`$i%16`*4] mov $t0,$e,ror#$Sigma1[0] + str $T1,[sp,#`$i%16`*4] eor $t0,$t0,$e,ror#$Sigma1[1] - eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) - add $T1,$T1,$t0 eor $t1,$f,$g + eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) and $t1,$t1,$e + add $T1,$T1,$t0 eor $t1,$t1,$g @ Ch(e,f,g) - add $T1,$T1,$t1 add $T1,$T1,$h - add $T1,$T1,$t2 mov $h,$a,ror#$Sigma0[0] + add $T1,$T1,$t1 eor $h,$h,$a,ror#$Sigma0[1] + add $T1,$T1,$t2 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) orr $t0,$a,$b - and $t0,$t0,$c and $t1,$a,$b + and $t0,$t0,$c + add $h,$h,$T1 orr $t0,$t0,$t1 @ Maj(a,b,c) - add $h,$h,$t0 add $d,$d,$T1 - add $h,$h,$T1 + add $h,$h,$t0 ___ } @@ -80,19 +85,19 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - ldr $t1,[sp,#`($i+1)%16`*4] @ $i + ldr $t1,[sp,#`($i+1)%16`*4] @ $i ldr $t2,[sp,#`($i+14)%16`*4] ldr $T1,[sp,#`($i+0)%16`*4] - ldr $inp,[sp,#`($i+9)%16`*4] mov $t0,$t1,ror#$sigma0[0] + ldr $inp,[sp,#`($i+9)%16`*4] eor $t0,$t0,$t1,ror#$sigma0[1] eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) mov $t1,$t2,ror#$sigma1[0] + add $T1,$T1,$t0 eor $t1,$t1,$t2,ror#$sigma1[1] + add $T1,$T1,$inp eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) - add $T1,$T1,$t0 add $T1,$T1,$t1 - add $T1,$T1,$inp ___ &BODY_00_15(@_); } diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index 4fbb94a914..3a35861ac6 100644 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -10,7 +10,13 @@ # SHA512 block procedure for ARMv4. September 2007. # This code is ~4.5 (four and a half) times faster than code generated -# by gcc 3.4 and it spends ~72 clock cycles per byte. +# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +# Xscale PXA250 core]. +# +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 6% improvement on +# Cortex A8 core and ~40 cycles per processed byte. # Byte order [in]dependence. ========================================= # @@ -22,7 +28,7 @@ $hi=0; $lo=4; # ==================================================================== -$output=shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; @@ -73,33 +79,31 @@ $code.=<<___; eor $t0,$t0,$Elo,lsl#23 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) adds $Tlo,$Tlo,$t0 - adc $Thi,$Thi,$t1 @ T += Sigma1(e) - adds $Tlo,$Tlo,$t2 - adc $Thi,$Thi,$t3 @ T += h - ldr $t0,[sp,#$Foff+0] @ f.lo + adc $Thi,$Thi,$t1 @ T += Sigma1(e) ldr $t1,[sp,#$Foff+4] @ f.hi + adds $Tlo,$Tlo,$t2 ldr $t2,[sp,#$Goff+0] @ g.lo + adc $Thi,$Thi,$t3 @ T += h ldr $t3,[sp,#$Goff+4] @ g.hi - str $Elo,[sp,#$Eoff+0] - str $Ehi,[sp,#$Eoff+4] - str $Alo,[sp,#$Aoff+0] - str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 + str $Elo,[sp,#$Eoff+0] eor $t1,$t1,$t3 + str $Ehi,[sp,#$Eoff+4] and $t0,$t0,$Elo + str $Alo,[sp,#$Aoff+0] and $t1,$t1,$Ehi + str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 - eor $t1,$t1,$t3 @ Ch(e,f,g) - ldr $t2,[$Ktbl,#4] @ K[i].lo + eor $t1,$t1,$t3 @ Ch(e,f,g) ldr $t3,[$Ktbl,#0] @ K[i].hi - ldr $Elo,[sp,#$Doff+0] @ d.lo - ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t0 + ldr $Elo,[sp,#$Doff+0] @ d.lo adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) + ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t2 adc $Thi,$Thi,$t3 @ T += K[i] adds $Elo,$Elo,$Tlo |