diff options
author | Andy Polyakov <appro@openssl.org> | 2014-02-01 21:07:16 +0100 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2014-02-01 21:07:16 +0100 |
commit | b217ca63b11f51178ea304e9f3b84e0ca3f6e076 (patch) | |
tree | 9966f101a925405f9270326406044d6bed4a5f83 /crypto/sha | |
parent | 30ea570f0fb2717b1faca5d611a3ab200c8eac83 (diff) |
crypto/sha/asm/sha1-x86_64.pl update:
+5% on Atom Silvermont, up to +8% improvement of legacy code.
Harmonize sha1-586.pl and aesni-sha1-x86_64.pl with sha1-x86_64.pl.
Diffstat (limited to 'crypto/sha')
-rw-r--r-- | crypto/sha/asm/sha1-586.pl | 130 | ||||
-rwxr-xr-x | crypto/sha/asm/sha1-x86_64.pl | 182 |
2 files changed, 202 insertions, 110 deletions
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index 632dbbe122..17b84c8bfe 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -93,8 +93,9 @@ # Westmere 7.3 5.5/+33% - # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% +# Haswell 6.5 4.3/+51% 4.1(**)/+58% # Bulldozer 11.6 6.0/+92% -# VIA Nano 10.6 7.4/+43% +# VIA Nano 10.6 7.5/+41% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # It remains mystery [to me] why ILP is limited to 1.7. @@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) }; &mov (@T[1],$C); &psubd (@X[-2&7],@X[3]); &xor (@T[1],$D); - &movdqa (@X[0],@X[-3&7]); + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],@T[1]); &jmp (&label("loop")); @@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); + eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); &movdqa (@X[2],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer - eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@X[2],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)); # ror &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)); # rol &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@X[4],@X[0]); - &movdqa 
(@X[2],@X[0]); - eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@X[2],@X[0]); eval(shift(@insns)); &pslldq (@X[4],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); &psrld (@X[2],31); eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)); # rol &movdqa (@X[3],@X[4]); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); &psrld (@X[4],30); - &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); + eval(shift(@insns)); # ror + &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &pslld (@X[3],2); - &pxor (@X[0],@X[4]); - eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); # rol + &pxor (@X[0],@X[4]); &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 - &movdqa (@X[1],@X[-2&7]) if ($Xi<7); + &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) + &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); eval(shift(@insns)); eval(shift(@insns)); @@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79() my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); - &movdqa (@X[2],@X[-1&7]) if ($Xi==8); eval(shift(@insns)); # body_20_39 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" + &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol @@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79() &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); if ($Xi%5) { &movdqa (@X[4],@X[3]); # 
"perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } - &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); # ror + &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" @@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 @@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); + eval(shift(@insns)) if (@insns[1] =~ /_rol/); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); &por (@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 @@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror - &movdqa (@X[3],@X[0]) if ($Xi<19); + &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0]) eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions @@ -692,6 +697,12 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); @@ -728,9 +739,16 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); @@ -739,6 +757,8 @@ sub Xloop_ssse3() &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@X[3]); foreach (@insns) { eval; } @@ -816,6 +836,64 @@ sub body_40_59 
() { # ((b^c)&(c^d))^c '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } +###### +sub bodyx_00_19 () { # ((c^d)&b)^d + # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K + return &bodyx_20_39() if ($rx==19); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + + '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2 + '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2 + '&lea ($e,&DWP(0,$e,@T[0]));', + '&rorx (@T[0],$a,5);', + + '&andn (@T[1],$a,$c);', + '&and ($a,$b)', + '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer + + '&xor (@T[1],$a)', + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub bodyx_20_39 () { # b^d^c + # on start $b=b^c^d + return &bodyx_40_59() if ($rx==39); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. + + '&add ($e,($j==19?@T[0]:$b))', + '&rorx ($b,@T[1],7);', # $b>>>2 + '&rorx (@T[0],$a,5);', + + '&xor ($a,$b) if ($j<79);', + '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer + '&xor ($a,$c) if ($j<79);', + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub bodyx_40_59 () { # ((b^c)&(c^d))^c + # on start $b=((b^c)&(c^d))^c + return &bodyx_20_39() if ($rx==59); $rx++; + ( + '($a,$b,$c,$d,$e)=@V;'. 
+ + '&rorx (@T[0],$a,5)', + '&lea ($e,&DWP(0,$e,$b))', + '&rorx ($b,@T[1],7)', # $b>>>2 + '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer + + '&mov (@T[1],$c)', + '&xor ($a,$b)', # b^c for next round + '&xor (@T[1],$b)', # c^d for next round + + '&and ($a,@T[1])', + '&add ($e,@T[0])', + '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} &set_label("loop",16); &Xupdate_ssse3_16_31(\&body_00_19); @@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c &mov (&DWP(12,@T[1]),$D); &xor ($B,$D); &mov (&DWP(16,@T[1]),$E); - &and ($B,@T[0]); - &movdqa (@X[0],@X[-3&7]); - &xchg ($B,@T[0]); + &mov (@T[1],@T[0]); + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); + &and (@T[0],$B); + &mov ($B,$T[1]); &jmp (&label("loop")); @@ -1226,9 +1305,10 @@ sub Xtail_avx() &mov (&DWP(8,@T[1]),$C); &xor ($B,$D); &mov (&DWP(12,@T[1]),$D); - &and ($B,@T[0]); &mov (&DWP(16,@T[1]),$E); - &xchg ($B,@T[0]); + &mov (@T[1],@T[0]); + &and (@T[0],$B); + &mov ($B,@T[1]); &jmp (&label("loop")); diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl index 71d5c12540..a22fea8424 100755 --- a/crypto/sha/asm/sha1-x86_64.pl +++ b/crypto/sha/asm/sha1-x86_64.pl @@ -62,16 +62,20 @@ # CPU clock cycles spent to process single byte (less is better). # # x86_64 SSSE3 AVX[2] -# P4 9.8 - -# Opteron 6.65 - -# Core2 6.70 6.05/+11% - -# Westmere 7.08 5.44/+30% - -# Sandy Bridge 7.93 6.16/+28% 4.99/+59% -# Ivy Bridge 6.30 4.63/+36% 4.60/+37% -# Haswell 5.98 4.12/+45% 3.57/+67% -# Bulldozer 10.9 5.95/+82% -# VIA Nano 10.2 7.46/+37% -# Atom 11.0 9.61/+14% +# P4 9.05 - +# Opteron 6.26 - +# Core2 6.55 6.05/+8% - +# Westmere 6.73 5.30/+27% - +# Sandy Bridge 7.70 6.10/+26% 4.99/+54% +# Ivy Bridge 6.06 4.67/+30% 4.60/+32% +# Haswell 5.45 4.15/+31% 3.57/+53% +# Bulldozer 9.11 5.95/+53% +# VIA Nano 9.32 7.15/+30% +# Atom [10.5?] 
[9.23?]/+14% +# Silvermont 13.1(*) 9.37/+40% +# +# (*) obviously suboptimal result, nothing was done about it, +# because SSSE3 code is compiled unconditionally; $flavour = shift; $output = shift; @@ -114,7 +118,7 @@ $num="%r10"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; -@xi=("%edx","%ebp"); +@xi=("%edx","%ebp","%r14d"); $A="%esi"; $B="%edi"; $C="%r11d"; @@ -129,42 +133,40 @@ my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi[0] bswap $xi[0] - mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); - mov $c,$t0 mov `4*$j`($inp),$xi[1] + mov $d,$t0 + mov $xi[0],`4*$i`(%rsp) mov $a,$t2 - xor $d,$t0 bswap $xi[1] + xor $c,$t0 rol \$5,$t2 - lea 0x5a827999($xi[0],$e),$e and $b,$t0 - mov $xi[1],`4*$j`(%rsp) + lea 0x5a827999($xi[0],$e),$e add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$e ___ $code.=<<___ if ($i>=15); - mov `4*($j%16)`(%rsp),$xi[1] - mov $c,$t0 + xor `4*($j%16)`(%rsp),$xi[1] + mov $d,$t0 + mov $xi[0],`4*($i%16)`(%rsp) mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] - xor $d,$t0 + xor $c,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 lea 0x5a827999($xi[0],$e),$e - xor `4*(($j+13)%16)`(%rsp),$xi[1] + rol \$30,$b xor $d,$t0 - rol \$1,$xi[1] add $t2,$e - rol \$30,$b - mov $xi[1],`4*($j%16)`(%rsp) + rol \$1,$xi[1] add $t0,$e ___ -unshift(@xi,pop(@xi)); +push(@xi,shift(@xi)); } sub BODY_20_39 { @@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); - mov `4*($j%16)`(%rsp),$xi[1] - mov $c,$t0 + xor `4*($j%16)`(%rsp),$xi[1] + mov $b,$t0 + `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] - xor $b,$t0 + xor $d,$t0 rol \$5,$t2 - lea $K($xi[0],$e),$e xor `4*(($j+8)%16)`(%rsp),$xi[1] - xor $d,$t0 + lea $K($xi[0],$e),$e + xor $c,$t0 add $t2,$e - xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b add $t0,$e rol \$1,$xi[1] ___ -$code.=<<___ if ($i<76); - mov $xi[1],`4*($j%16)`(%rsp) -___ $code.=<<___ if ($i==79); - mov $c,$t0 + mov $b,$t0 mov $a,$t2 - xor $b,$t0 + xor 
$d,$t0 lea $K($xi[0],$e),$e rol \$5,$t2 - xor $d,$t0 + xor $c,$t0 add $t2,$e rol \$30,$b add $t0,$e ___ -unshift(@xi,pop(@xi)); +push(@xi,shift(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; - mov `4*($j%16)`(%rsp),$xi[1] - mov $c,$t0 - mov $c,$t1 + xor `4*($j%16)`(%rsp),$xi[1] + mov $d,$t0 + mov $xi[0],`4*($i%16)`(%rsp) + mov $d,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] - and $d,$t0 + and $c,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] - xor $d,$t1 lea 0x8f1bbcdc($xi[0],$e),$e + xor $c,$t1 rol \$5,$t2 - xor `4*(($j+13)%16)`(%rsp),$xi[1] add $t0,$e - and $b,$t1 rol \$1,$xi[1] - add $t1,$e - rol \$30,$b - mov $xi[1],`4*($j%16)`(%rsp) + and $b,$t1 add $t2,$e + rol \$30,$b + add $t1,$e ___ -unshift(@xi,pop(@xi)); +push(@xi,shift(@xi)); } $code.=<<___; @@ -261,17 +259,18 @@ $code.=<<___; .align 16 .Lialu: + mov %rsp,%rax push %rbx push %rbp push %r12 push %r13 - mov %rsp,%r11 + push %r14 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument - mov %r11,`16*4`(%rsp) + mov %rax,`16*4`(%rsp) .Lprologue: mov 0($ctx),$A @@ -305,11 +304,12 @@ $code.=<<___; jnz .Lloop mov `16*4`(%rsp),%rsi - mov (%rsi),%r13 - mov 8(%rsi),%r12 - mov 16(%rsi),%rbp - mov 24(%rsi),%rbx - lea 32(%rsi),%rsp + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue: ret .size sha1_block_data_order,.-sha1_block_data_order @@ -389,11 +389,11 @@ $code.=<<___; movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @X[2],@X[-4&7] # byte swap - add \$64,$inp pshufb @X[2],@X[-3&7] pshufb @X[2],@X[-2&7] - pshufb @X[2],@X[-1&7] + add \$64,$inp paddd @Tx[1],@X[-4&7] # add K_00_19 + pshufb @X[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU @@ -418,61 +418,61 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 my @insns = (&$body,&$body,&$body,&$body); # 40 
instructions my ($a,$b,$c,$d,$e); - &movdqa (@X[0],@X[-3&7]); - eval(shift(@insns)); + eval(shift(@insns)); # ror + &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); + &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); - eval(shift(@insns)); - + eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); - eval(shift(@insns)); + eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); - &movdqa (@Tx[0],@X[0]); - eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); + eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); - &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); + eval(shift(@insns)); # ror + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); @@ -480,12 +480,13 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); - eval(shift(@insns)); 
&movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] @@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79() my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); - &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); - eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)) if ($Xi==8); + eval(shift(@insns)); # body_20_39 eval(shift(@insns)); + eval(shift(@insns)) if (@insns[1] =~ /_ror/); + eval(shift(@insns)) if (@insns[0] =~ /_ror/); + &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... 
or load next one &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); } - &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); # ror + &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" @@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79() eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol + eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); + eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); - eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &psrld (@Tx[0],30); eval(shift(@insns)); - eval(shift(@insns)); # rol + &psrld (@Tx[0],30); + eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror - eval(shift(@insns)); &por (@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 eval(shift(@insns)); - &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)) if (@insns[1] =~ /_rol/); + eval(shift(@insns)) if (@insns[0] =~ /_rol/); + &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); @@ -567,10 +573,11 @@ sub Xuplast_ssse3_80() my ($a,$b,$c,$d,$e); eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU @@ -602,10 +609,12 @@ sub Xloop_ssse3() eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); 
eval(shift(@insns)); @@ -614,6 +623,8 @@ sub Xloop_ssse3() &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } @@ -1680,16 +1691,17 @@ se_handler: jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer - lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 + mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 jmp .Lcommon_seh_tail .size se_handler,.-se_handler |