author     Andy Polyakov <appro@openssl.org>  2014-02-05 14:03:35 +0100
committer  Andy Polyakov <appro@openssl.org>  2014-02-05 14:03:35 +0100
commit     3847d15d6bf124b1703fbc27f69bdce7755f768d
tree       923e678ef0e169decc2a2ab777dc8f40878e4d38
parent     3ef477c69f2fd39549123d7b0b869029b46cf989
[aesni|sha*]-mb-x86_64.pl: add data prefetching.
Diffstat (limited to 'crypto/sha')

 -rw-r--r--  crypto/sha/asm/sha1-mb-x86_64.pl    40
 -rw-r--r--  crypto/sha/asm/sha256-mb-x86_64.pl  19

 2 files changed, 47 insertions(+), 12 deletions(-)
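The technique in a nutshell: each multi-buffer routine hashes n independent input streams in one SIMD loop, so every iteration has to fetch n unrelated cache lines. The patch interleaves one prefetcht0 hint per stream pointer into the round-14/15 code of each loop body, presumably spreading the extra loads between arithmetic instructions so they occupy otherwise idle load slots. A minimal perlasm-style sketch of the emission pattern (register names are made up for illustration, not the actual OpenSSL code):

#!/usr/bin/env perl
# Minimal sketch of per-stream prefetch emission in perlasm style.
# @ptr is an illustrative assumption; in the real modules the pointers
# are already advanced past the block being consumed, so 63(ptr) falls
# inside the next 64-byte input block.
my @ptr = ("%r8", "%r9", "%r10", "%r11");   # one pointer per SHA lane
my $code = "";
for my $k (0 .. $#ptr) {
    # prefetcht0 requests the line into all cache levels (L1 included)
    # without stalling the pipeline.
    $code .= "\tprefetcht0\t63($ptr[$k])\n";
}
print $code;
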
diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl
index 93996e15f9..33e6620c68 100644
--- a/crypto/sha/asm/sha1-mb-x86_64.pl
+++ b/crypto/sha/asm/sha1-mb-x86_64.pl
@@ -14,20 +14,21 @@
#
# this +aesni(i) sha1 aesni-sha1 gain(iv)
# -------------------------------------------------------------------
-# Westmere(ii) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70%
-# Atom(ii) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62%
+# Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68%
+# Atom(ii) 18.9?/n +3.93=8.66(n=4) 10.0 14.0 +62%
# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
-# Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68%
+# Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68%
# Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
-# Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64%
+# Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64%
#
# (i) multi-block CBC encrypt with 128-bit key;
# (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
# because of lower AES-NI instruction throughput;
# (iii) "this" is for n=8, when we gather twice as much data, result
-# for n=4 is 7.98+4.44=12.4;
-# (iv) improvement coefficients in real-life application are somewhat
-# lower and range from 30% to 100% (on Haswell);
+# for n=4 is 8.00+4.44=12.4;
+# (iv) presented improvement coefficients are asymptotic limits and
+# in real-life application are somewhat lower, e.g. for 2KB
+# fragments they range from 30% to 100% (on Haswell);
$flavour = shift;
$output = shift;
@@ -80,6 +81,14 @@ $Tbl="%rbp";
@Xi=map("%xmm$_",(10..14));
$K="%xmm15";
+if (1) {
+ # Atom-specific optimization aiming to eliminate pshufb with high
+ # registers [and thus get rid of 48 cycles accumulated penalty]
+ @Xi=map("%xmm$_",(0..4));
+ ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
+ @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
+}
+
$REG_SZ=16;
sub Xi_off {
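
A note on the block added above: %xmm8 through %xmm15 can only be encoded with a REX prefix, and Atom's in-order front end charges extra for prefixed SSSE3 instructions, which is presumably how the accumulated 48-cycle penalty arises (16 pshufb byte-swaps per block, roughly 3 cycles each). The remap keeps every pshufb operand in REX-free registers, while the A-E state, which pshufb never touches, moves up. A schematic sketch of the allocation (names mirror the diff but are illustrative):

# Keep pshufb's operands in %xmm0-%xmm7, which encode without a REX
# prefix; the working state A..E is never a pshufb operand, so it can
# stay in the high registers.
my @Xi = map { "%xmm$_" } (0 .. 4);    # message schedule words
my $tx = "%xmm5";                      # pbswap mask
my @V  = map { "%xmm$_" } (10 .. 14);  # A..E working state
printf "pshufb\t%s,%s\n", $tx, $Xi[1]; # 66 0F 38 00 /r, no REX byte
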
@@ -139,8 +148,8 @@ $code.=<<___ if ($i<14); # just load input
psrld \$2,$b
paddd $t2,$e # e+=rol(a,5)
- movd `4*$j-16*4`(@ptr[2]),$t2
pshufb $tx,@Xi[1]
+ movd `4*$j-16*4`(@ptr[2]),$t2
por $t1,$b # b=rol(b,30)
___
$code.=<<___ if ($i==14); # just load input
@@ -152,6 +161,7 @@ $code.=<<___ if ($i==14); # just load input
movdqa $b,$t1
movdqa $b,$t0
pslld \$5,$t2
+ prefetcht0 63(@ptr[0])
pandn $d,$t1
pand $c,$t0
punpckldq $t3,@Xi[1]
@@ -162,14 +172,17 @@ $code.=<<___ if ($i==14); # just load input
psrld \$27,$t3
pxor $t1,$t0 # Ch(b,c,d)
movdqa $b,$t1
+ prefetcht0 63(@ptr[1])
por $t3,$t2 # rol(a,5)
pslld \$30,$t1
paddd $t0,$e # e+=Ch(b,c,d)
+ prefetcht0 63(@ptr[2])
psrld \$2,$b
paddd $t2,$e # e+=rol(a,5)
pshufb $tx,@Xi[1]
+ prefetcht0 63(@ptr[3])
por $t1,$b # b=rol(b,30)
___
$code.=<<___ if ($i>=13 && $i<15);
@@ -382,12 +395,12 @@ $code.=<<___;
movdqu 0x60($ctx),$D
movdqu 0x80($ctx),$E
movdqa 0x60($Tbl),$tx # pbswap_mask
+ movdqa -0x20($Tbl),$K # K_00_19
jmp .Loop
.align 32
.Loop:
___
-$code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
@@ -434,6 +447,7 @@ $code.=<<___;
movdqa @Xi[0],(%rbx) # save counters
movdqa 0x60($Tbl),$tx # pbswap_mask
+ movdqa -0x20($Tbl),$K # K_00_19
dec $num
jnz .Loop
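
The two hunks above are one optimization: $K is clobbered by K_20_39, K_40_59 and K_60_79 as the 80 rounds progress, so K_00_19 must be re-seeded once per iteration; loading it before jmp .Loop and again beside the counter bookkeeping at the loop tail, rather than at the loop head, lets round 0 of every iteration find the constant already resident. A generic sketch of the rotation (registers and labels are illustrative, not the module's):

# Hoist a per-iteration reload out of the loop head so it overlaps the
# tail bookkeeping instead of delaying the first round.
my ($Tbl, $K, $num) = ("%rbp", "%xmm15", "%rdx");
my $code = "";
$code .= "\tmovdqa\t-0x20($Tbl),$K\n";  # constant ready before entry
$code .= "\tjmp\t.Loop\n.Loop:\n";
$code .= "\t# ... 80 rounds; later rounds clobber the constant ...\n";
$code .= "\tmovdqa\t-0x20($Tbl),$K\n";  # tail reload, hidden behind
$code .= "\tdec\t$num\n\tjnz\t.Loop\n"; # the counter update and branch
print $code;
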
@@ -551,6 +565,7 @@ $code.=<<___ if ($i<14);
___
$code.=<<___ if ($i==14);
vpaddd $K,$e,$e # e+=K_00_19
+ prefetcht0 63(@ptr[0])
vpslld \$5,$a,$t2
vpandn $d,$b,$t1
vpand $c,$b,$t0
@@ -559,14 +574,17 @@ $code.=<<___ if ($i==14);
vpaddd @Xi[0],$e,$e # e+=X[i]
$vpack $t3,@Xi[1],@Xi[1]
vpsrld \$27,$a,$t3
+ prefetcht0 63(@ptr[1])
vpxor $t1,$t0,$t0 # Ch(b,c,d)
vpslld \$30,$b,$t1
vpor $t3,$t2,$t2 # rol(a,5)
+ prefetcht0 63(@ptr[2])
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
vpsrld \$2,$b,$b
vpaddd $t2,$e,$e # e+=rol(a,5)
+ prefetcht0 63(@ptr[3])
vpshufb $tx,@Xi[1],@Xi[1]
vpor $t1,$b,$b # b=rol(b,30)
___
@@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15); # apply Xupdate
vpaddd $K,$e,$e # e+=K_00_19
vpslld \$5,$a,$t2
vpandn $d,$b,$t1
+ `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpand $c,$b,$t0
vmovdqa @Xi[0],`&Xi_off($i)`
@@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15); # apply Xupdate
vpsrld \$27,$a,$t3
vpxor $t1,$t0,$t0 # Ch(b,c,d)
vpxor @Xi[3],@Xi[1],@Xi[1]
+ `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpslld \$30,$b,$t1
vpor $t3,$t2,$t2 # rol(a,5)
vpaddd $t0,$e,$e # e+=Ch(b,c,d)
+ `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpsrld \$31,@Xi[1],$tx
vpaddd @Xi[1],@Xi[1],@Xi[1]
vpsrld \$2,$b,$b
+ `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpaddd $t2,$e,$e # e+=rol(a,5)
vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
vpor $t1,$b,$b # b=rol(b,30)
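
The back-tick segments in this hunk are the perlasm idiom for conditional emission: the module post-processes $code with an eval substitution, so a quoted instruction survives only when its trailing condition holds, here only in the 8-lane AVX2 build ($REG_SZ==32), which has four more stream pointers to prefetch. A self-contained sketch of the mechanism (pointer registers are stand-ins):

#!/usr/bin/env perl
my $REG_SZ = 32;                      # 16 => 4 lanes, 32 => 8 lanes
my @ptr = map { "%r$_" } (8 .. 15);   # illustrative input pointers
my $i = 15;
my $code = <<___;
	`"prefetcht0	63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
___
# Each `...` fragment is eval()ed; its result (or nothing, when the
# condition is false) replaces the fragment in the output.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
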
diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl
index 2e4b102f52..e86f0bc7fa 100644
--- a/crypto/sha/asm/sha256-mb-x86_64.pl
+++ b/crypto/sha/asm/sha256-mb-x86_64.pl
@@ -15,7 +15,7 @@
# this +aesni(i) sha256 aesni-sha256 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
-# Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
+# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
@@ -27,8 +27,9 @@
# AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
-# (iv) improvement coefficients in real-life application are somewhat
-# lower and range from 75% to 130% (on Haswell);
+# (iv) presented improvement coefficients are asymptotic limits and
+# in real-life application are somewhat lower, e.g. for 2KB
+# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;
$output = shift;
@@ -135,6 +136,7 @@ $code.=<<___;
psrld \$25-11,$t2
movdqa $e,$t1
+ `"prefetch 63(@ptr[0])" if ($i==15)`
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
@@ -142,6 +144,7 @@ $code.=<<___;
pand $f,$axb
pxor $t2,$sigma
+ `"prefetch 63(@ptr[1])" if ($i==15)`
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
@@ -153,6 +156,7 @@ $code.=<<___;
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
+ `"prefetch 63(@ptr[2])" if ($i==15)`
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
@@ -160,6 +164,7 @@ $code.=<<___;
pand $axb,$bxc
pxor $sigma,$t2
+ `"prefetch 63(@ptr[3])" if ($i==15)`
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
@@ -465,30 +470,38 @@ $code.=<<___;
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
+ `"prefetch 63(@ptr[0])" if ($i==15)`
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
+ `"prefetch 63(@ptr[1])" if ($i==15)`
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
+ `"prefetch 63(@ptr[2])" if ($i==15)`
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
+ `"prefetch 63(@ptr[3])" if ($i==15)`
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
+ `"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
+ `"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
+ `"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
+ `"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)
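
One detail in these sha256 hunks deserves a remark: the Maj function is computed as h=Maj(a,b,c)=Ch(a^b,c,b), reusing the a^b value that each round hands to the next (the axb/bxc registers above), which is cheaper than computing Maj directly. The identity is easy to check exhaustively; a standalone verification, not part of the module:

#!/usr/bin/env perl
# Verify Maj(a,b,c) == Ch(a^b,c,b) over all bit combinations, where
# Maj(x,y,z) = (x&y)|(x&z)|(y&z) and Ch(x,y,z) = (x&y)^(~x&z).
for my $a (0, 1) {
    for my $b (0, 1) {
        for my $c (0, 1) {
            my $maj = ($a & $b) | ($a & $c) | ($b & $c);
            my $ch  = ((($a ^ $b) & $c) ^ (~($a ^ $b) & $b)) & 1;
            die "mismatch at ($a,$b,$c)\n" unless $maj == $ch;
        }
    }
}
print "Maj(a,b,c) == Ch(a^b,c,b) holds\n";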