diff options
author | Andy Polyakov <appro@openssl.org> | 2018-06-02 15:25:50 +0200 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2018-06-03 21:20:40 +0200 |
commit | c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03 (patch) | |
tree | 0efd8ba216093d0029a697208136fcbe76fda7d5 /crypto/sha | |
parent | 1a467bd12f20928f3d5e6809b5f9394dbe606541 (diff) |
sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.
Biggest part, ~7%, of improvement resulted from omitting constants'
table index increment in each round. And minor part from rescheduling
instructions. Apparently POWER9 (and POWER8) manage to dispatch
instructions more efficiently if they are laid down as if they have
no latency...
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6406)
Diffstat (limited to 'crypto/sha')
-rwxr-xr-x | crypto/sha/asm/sha512p8-ppc.pl | 122 |
1 files changed, 55 insertions, 67 deletions
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl index 7a8d4358f0..e3f522cb7c 100755 --- a/crypto/sha/asm/sha512p8-ppc.pl +++ b/crypto/sha/asm/sha512p8-ppc.pl @@ -37,8 +37,8 @@ # build of sha512-ppc.pl, presented for reference. # # POWER8 POWER9 -# SHA256 9.9 [15.8] 12.2 [12.5] -# SHA512 6.3 [10.3] 7.7 [7.9] +# SHA256 9.7 [15.8] 11.2 [12.5] +# SHA512 6.1 [10.3] 7.0 [7.9] $flavour=shift; $output =shift; @@ -79,7 +79,8 @@ if ($output =~ /512/) { } $func="sha${bits}_block_p8"; -$FRAME=8*$SIZE_T; +$LOCALS=8*$SIZE_T+8*16; +$FRAME=$LOCALS+9*16+6*$SIZE_T; $sp ="r1"; $toc="r2"; @@ -91,16 +92,17 @@ $idx="r7"; $lrsave="r8"; $offload="r11"; $vrsave="r12"; -($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31)); - $x00=0 if ($flavour =~ /osx/); +@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31)); + $x00=0 if ($flavour =~ /osx/); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7)); -@X=map("v$_",(8..23)); -($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31)); +@X=map("v$_",(8..19,24..27)); +($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31)); sub ROUND { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; my $j=($i+1)%16; +my $k=($i+2)%8; $code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1)); lvx_u @X[$i+1],0,$inp ; load X[i] in advance @@ -112,26 +114,30 @@ ___ $code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0); vperm @X[$i],@X[$i],@X[$i],$lemask ___ +$code.=<<___ if ($i>=15); + vshasigma${sz} $Sigma,@X[($j+1)%16],0,0 + vaddu${sz}m @X[$j],@X[$j],$Sigma + vshasigma${sz} $Sigma,@X[($j+14)%16],0,15 + vaddu${sz}m @X[$j],@X[$j],$Sigma + vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16] +___ $code.=<<___; - `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)` - vsel $Func,$g,$f,$e ; Ch(e,f,g) - vshasigma${sz} $S1,$e,1,15 ; Sigma1(e) vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i] - vshasigma${sz} $S0,$a,1,0 ; Sigma0(a) - `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)` + vsel $Func,$g,$f,$e ; Ch(e,f,g) + vaddu${sz}m $g,$g,$Ki ; future h+=K[i] vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g) + vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e) + vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e) vxor $Func,$a,$b - `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)` - vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e) vsel $Func,$b,$c,$Func ; Maj(a,b,c) - vaddu${sz}m $g,$g,$Ki ; future h+=K[i] vaddu${sz}m $d,$d,$h ; d+=h - vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c) - `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)` - lvx $Ki,$idx,$Tbl ; load next K[i] - addi $idx,$idx,16 - vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c) - `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)` + vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a) + vaddu${sz}m $Sigma,$Sigma,$Func ; Sigma0(a)+Maj(a,b,c) + vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c) + lvx $Ki,@I[$k],$idx ; load next K[i] +___ +$code.=<<___ if ($k == 7); + addi $idx,$idx,0x80 ___ } @@ -142,21 +148,13 @@ $code=<<___; .globl $func .align 6 $func: - $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + $STU $sp,-$FRAME($sp) mflr $lrsave - li r10,`$FRAME+8*16+15` - li r11,`$FRAME+8*16+31` - stvx v20,r10,$sp # ABI says so + li r10,`$LOCALS+15` + li r11,`$LOCALS+31` + stvx v24,r10,$sp # ABI says so addi r10,r10,32 mfspr $vrsave,256 - stvx v21,r11,$sp - addi r11,r11,32 - stvx v22,r10,$sp - addi r10,r10,32 - stvx v23,r11,$sp - addi r11,r11,32 - stvx v24,r10,$sp - addi r10,r10,32 stvx v25,r11,$sp addi r11,r11,32 stvx v26,r10,$sp @@ -169,26 +167,26 @@ $func: addi r11,r11,32 stvx v30,r10,$sp stvx v31,r11,$sp - li r11,-1 - stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li r11,-4096+255 + stw $vrsave,`$FRAME+6*$SIZE_T-4`($sp) # save vrsave li $x10,0x10 - $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $PUSH r26,`$FRAME-6*$SIZE_T`($sp) li $x20,0x20 - $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $PUSH r27,`$FRAME-5*$SIZE_T`($sp) li $x30,0x30 - $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $PUSH r28,`$FRAME-4*$SIZE_T`($sp) li $x40,0x40 - $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $PUSH r29,`$FRAME-3*$SIZE_T`($sp) li $x50,0x50 - $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $PUSH r30,`$FRAME-2*$SIZE_T`($sp) li $x60,0x60 - $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + $PUSH r31,`$FRAME-1*$SIZE_T`($sp) li $x70,0x70 - $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + $PUSH $lrsave,`$FRAME+$LRSAVE`($sp) mtspr 256,r11 bl LPICmeup - addi $offload,$sp,$FRAME+15 + addi $offload,$sp,`8*$SIZE_T+15` ___ $code.=<<___ if ($LENDIAN); li $idx,8 @@ -222,9 +220,9 @@ $code.=<<___; .align 5 Loop: lvx $Ki,$x00,$Tbl - li $idx,16 lvx_u @X[0],0,$inp addi $inp,$inp,16 + mr $idx,$Tbl # copy $Tbl stvx $A,$x00,$offload # offload $A-$H stvx $B,$x10,$offload stvx $C,$x20,$offload @@ -234,8 +232,7 @@ Loop: stvx $G,$x60,$offload stvx $H,$x70,$offload vaddu${sz}m $H,$H,$Ki # h+K[i] - lvx $Ki,$idx,$Tbl - addi $idx,$idx,16 + lvx $Ki,$x10,$Tbl ___ for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); } $code.=<<___; @@ -268,10 +265,9 @@ $code.=<<___; bne Loop ___ $code.=<<___ if ($SZ==4); - lvx @X[0],$idx,$Tbl - addi $idx,$idx,16 + lvx @X[0],$x20,$idx vperm $A,$A,$B,$Ki # pack the answer - lvx @X[1],$idx,$Tbl + lvx @X[1],$x30,$idx vperm $E,$E,$F,$Ki vperm $A,$A,$C,@X[0] vperm $E,$E,$G,@X[0] @@ -291,19 +287,11 @@ $code.=<<___ if ($SZ==8); stvx_u $G,$x30,$ctx ___ $code.=<<___; - li r10,`$FRAME+8*16+15` + li r10,`$LOCALS+15` mtlr $lrsave - li r11,`$FRAME+8*16+31` + li r11,`$LOCALS+31` mtspr 256,$vrsave - lvx v20,r10,$sp # ABI says so - addi r10,r10,32 - lvx v21,r11,$sp - addi r11,r11,32 - lvx v22,r10,$sp - addi r10,r10,32 - lvx v23,r11,$sp - addi r11,r11,32 - lvx v24,r10,$sp + lvx v24,r10,$sp # ABI says so addi r10,r10,32 lvx v25,r11,$sp addi r11,r11,32 @@ -317,13 +305,13 @@ $code.=<<___; addi r11,r11,32 lvx v30,r10,$sp lvx v31,r11,$sp - $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) - $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) - $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) - $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) - $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) - $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) - addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + $POP r26,`$FRAME-6*$SIZE_T`($sp) + $POP r27,`$FRAME-5*$SIZE_T`($sp) + $POP r28,`$FRAME-4*$SIZE_T`($sp) + $POP r29,`$FRAME-3*$SIZE_T`($sp) + $POP r30,`$FRAME-2*$SIZE_T`($sp) + $POP r31,`$FRAME-1*$SIZE_T`($sp) + addi $sp,$sp,$FRAME blr .long 0 .byte 0,12,4,1,0x80,6,3,0 |