summary | refs | log | tree | commit | diff | stats
path: root/crypto/sha
diff options
context:
space:
mode:
author: Andy Polyakov <appro@openssl.org> 2018-06-02 15:25:50 +0200
committer: Andy Polyakov <appro@openssl.org> 2018-06-03 21:20:40 +0200
commit: c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03 (patch)
tree: 0efd8ba216093d0029a697208136fcbe76fda7d5 /crypto/sha
parent: 1a467bd12f20928f3d5e6809b5f9394dbe606541 (diff)
sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.
The biggest part of the improvement, ~7%, came from omitting the constants' table index increment in each round, and a minor part came from rescheduling instructions. Apparently POWER9 (and POWER8) manage to dispatch instructions more efficiently if they are laid out as if they had no latency... Reviewed-by: Rich Salz <rsalz@openssl.org> (Merged from https://github.com/openssl/openssl/pull/6406)
Diffstat (limited to 'crypto/sha')
-rwxr-xr-x crypto/sha/asm/sha512p8-ppc.pl 122
1 files changed, 55 insertions, 67 deletions
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index 7a8d4358f0..e3f522cb7c 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -37,8 +37,8 @@
# build of sha512-ppc.pl, presented for reference.
#
# POWER8 POWER9
-# SHA256 9.9 [15.8] 12.2 [12.5]
-# SHA512 6.3 [10.3] 7.7 [7.9]
+# SHA256 9.7 [15.8] 11.2 [12.5]
+# SHA512 6.1 [10.3] 7.0 [7.9]
$flavour=shift;
$output =shift;
@@ -79,7 +79,8 @@ if ($output =~ /512/) {
}
$func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
$sp ="r1";
$toc="r2";
@@ -91,16 +92,17 @@ $idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+ $x00=0 if ($flavour =~ /osx/);
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
+my $k=($i+2)%8;
$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
lvx_u @X[$i+1],0,$inp ; load X[i] in advance
@@ -112,26 +114,30 @@ ___
$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
vperm @X[$i],@X[$i],@X[$i],$lemask
___
+$code.=<<___ if ($i>=15);
+ vshasigma${sz} $Sigma,@X[($j+1)%16],0,0
+ vaddu${sz}m @X[$j],@X[$j],$Sigma
+ vshasigma${sz} $Sigma,@X[($j+14)%16],0,15
+ vaddu${sz}m @X[$j],@X[$j],$Sigma
+ vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]
+___
$code.=<<___;
- `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
- vsel $Func,$g,$f,$e ; Ch(e,f,g)
- vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
- vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
- `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
+ vsel $Func,$g,$f,$e ; Ch(e,f,g)
+ vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
+ vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e)
+ vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e)
vxor $Func,$a,$b
- `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
- vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
vsel $Func,$b,$c,$Func ; Maj(a,b,c)
- vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $d,$d,$h ; d+=h
- vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
- `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
- lvx $Ki,$idx,$Tbl ; load next K[i]
- addi $idx,$idx,16
- vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
- `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
+ vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a)
+ vaddu${sz}m $Sigma,$Sigma,$Func ; Sigma0(a)+Maj(a,b,c)
+ vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c)
+ lvx $Ki,@I[$k],$idx ; load next K[i]
+___
+$code.=<<___ if ($k == 7);
+ addi $idx,$idx,0x80
___
}
@@ -142,21 +148,13 @@ $code=<<___;
.globl $func
.align 6
$func:
- $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ $STU $sp,-$FRAME($sp)
mflr $lrsave
- li r10,`$FRAME+8*16+15`
- li r11,`$FRAME+8*16+31`
- stvx v20,r10,$sp # ABI says so
+ li r10,`$LOCALS+15`
+ li r11,`$LOCALS+31`
+ stvx v24,r10,$sp # ABI says so
addi r10,r10,32
mfspr $vrsave,256
- stvx v21,r11,$sp
- addi r11,r11,32
- stvx v22,r10,$sp
- addi r10,r10,32
- stvx v23,r11,$sp
- addi r11,r11,32
- stvx v24,r10,$sp
- addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
@@ -169,26 +167,26 @@ $func:
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
- li r11,-1
- stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li r11,-4096+255 # 0xfffff0ff, new VRSAVE mask (loaded by mtspr 256 below)
+ stw $vrsave,`$FRAME-6*$SIZE_T-4`($sp) # save vrsave in the word just below the r26 slot, inside our own frame
li $x10,0x10
- $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $PUSH r26,`$FRAME-6*$SIZE_T`($sp)
li $x20,0x20
- $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $PUSH r27,`$FRAME-5*$SIZE_T`($sp)
li $x30,0x30
- $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $PUSH r28,`$FRAME-4*$SIZE_T`($sp)
li $x40,0x40
- $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $PUSH r29,`$FRAME-3*$SIZE_T`($sp)
li $x50,0x50
- $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $PUSH r30,`$FRAME-2*$SIZE_T`($sp)
li $x60,0x60
- $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ $PUSH r31,`$FRAME-1*$SIZE_T`($sp)
li $x70,0x70
- $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ $PUSH $lrsave,`$FRAME+$LRSAVE`($sp)
mtspr 256,r11
bl LPICmeup
- addi $offload,$sp,$FRAME+15
+ addi $offload,$sp,`8*$SIZE_T+15`
___
$code.=<<___ if ($LENDIAN);
li $idx,8
@@ -222,9 +220,9 @@ $code.=<<___;
.align 5
Loop:
lvx $Ki,$x00,$Tbl
- li $idx,16
lvx_u @X[0],0,$inp
addi $inp,$inp,16
+ mr $idx,$Tbl # copy $Tbl
stvx $A,$x00,$offload # offload $A-$H
stvx $B,$x10,$offload
stvx $C,$x20,$offload
@@ -234,8 +232,7 @@ Loop:
stvx $G,$x60,$offload
stvx $H,$x70,$offload
vaddu${sz}m $H,$H,$Ki # h+K[i]
- lvx $Ki,$idx,$Tbl
- addi $idx,$idx,16
+ lvx $Ki,$x10,$Tbl
___
for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
@@ -268,10 +265,9 @@ $code.=<<___;
bne Loop
___
$code.=<<___ if ($SZ==4);
- lvx @X[0],$idx,$Tbl
- addi $idx,$idx,16
+ lvx @X[0],$x20,$idx
vperm $A,$A,$B,$Ki # pack the answer
- lvx @X[1],$idx,$Tbl
+ lvx @X[1],$x30,$idx
vperm $E,$E,$F,$Ki
vperm $A,$A,$C,@X[0]
vperm $E,$E,$G,@X[0]
@@ -291,19 +287,11 @@ $code.=<<___ if ($SZ==8);
stvx_u $G,$x30,$ctx
___
$code.=<<___;
- li r10,`$FRAME+8*16+15`
+ li r10,`$LOCALS+15`
mtlr $lrsave
- li r11,`$FRAME+8*16+31`
+ li r11,`$LOCALS+31`
mtspr 256,$vrsave
- lvx v20,r10,$sp # ABI says so
- addi r10,r10,32
- lvx v21,r11,$sp
- addi r11,r11,32
- lvx v22,r10,$sp
- addi r10,r10,32
- lvx v23,r11,$sp
- addi r11,r11,32
- lvx v24,r10,$sp
+ lvx v24,r10,$sp # ABI says so
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
@@ -317,13 +305,13 @@ $code.=<<___;
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
- $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
- $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
- $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
- $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
- $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
- $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
- addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ $POP r26,`$FRAME-6*$SIZE_T`($sp)
+ $POP r27,`$FRAME-5*$SIZE_T`($sp)
+ $POP r28,`$FRAME-4*$SIZE_T`($sp)
+ $POP r29,`$FRAME-3*$SIZE_T`($sp)
+ $POP r30,`$FRAME-2*$SIZE_T`($sp)
+ $POP r31,`$FRAME-1*$SIZE_T`($sp)
+ addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,6,3,0