summaryrefslogtreecommitdiffstats
path: root/crypto/sha
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2014-02-01 21:24:55 +0100
committerAndy Polyakov <appro@openssl.org>2014-02-01 21:24:55 +0100
commit729d334106e6ef3a2b2f4f9cb2520669a07ae79d (patch)
treed62f23000a0ffcf110dc677cd16ec3662341108b /crypto/sha
parentcacdfcb2479984d9bfcc79b623118d8af6fea169 (diff)
crypto/sha/asm/sha1-x86_64.pl: jumbo update from master.
Diffstat (limited to 'crypto/sha')
-rw-r--r--crypto/sha/asm/sha1-586.pl130
-rwxr-xr-xcrypto/sha/asm/sha1-x86_64.pl182
2 files changed, 202 insertions, 110 deletions
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index 632dbbe122..17b84c8bfe 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -93,8 +93,9 @@
# Westmere 7.3 5.5/+33% -
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
+# Haswell 6.5 4.3/+51% 4.1(**)/+58%
# Bulldozer 11.6 6.0/+92%
-# VIA Nano 10.6 7.4/+43%
+# VIA Nano 10.6 7.5/+41%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# It remains mystery [to me] why ILP is limited to 1.7.
@@ -512,7 +513,7 @@ my $_ror=sub { &ror(@_) };
&mov (@T[1],$C);
&psubd (@X[-2&7],@X[3]);
&xor (@T[1],$D);
- &movdqa (@X[0],@X[-3&7]);
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
&and (@T[0],@T[1]);
&jmp (&label("loop"));
@@ -539,76 +540,77 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
+ eval(shift(@insns)); # ror
eval(shift(@insns));
eval(shift(@insns));
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
&movdqa (@X[2],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@X[2],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
&pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@X[4],@X[0]);
- &movdqa (@X[2],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@X[2],@X[0]);
eval(shift(@insns));
&pslldq (@X[4],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@X[2],31);
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (@X[3],@X[4]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&psrld (@X[4],30);
- &por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@X[2]); # "X[0]"<<<=1
eval(shift(@insns));
&movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
&pslld (@X[3],2);
- &pxor (@X[0],@X[4]);
- eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ &pxor (@X[0],@X[4]);
&movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2
- &movdqa (@X[1],@X[-2&7]) if ($Xi<7);
+ &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7])
+ &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7);
eval(shift(@insns));
eval(shift(@insns));
@@ -623,10 +625,9 @@ sub Xupdate_ssse3_32_79()
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[2],@X[-1&7]) if ($Xi==8);
eval(shift(@insns)); # body_20_39
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@X[2],@X[-2&7],8); # compose "X[-6]"
+ &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8)
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
@@ -635,13 +636,14 @@ sub Xupdate_ssse3_32_79()
&movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
if ($Xi%5) {
&movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
}
- &paddd (@X[3],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@X[2]); # "X[0]"^="X[-6]"
@@ -656,6 +658,7 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&pslld (@X[0],2);
eval(shift(@insns)); # body_20_39
@@ -667,6 +670,8 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
&por (@X[0],@X[2]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
@@ -677,7 +682,7 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- &movdqa (@X[3],@X[0]) if ($Xi<19);
+ &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0])
eval(shift(@insns));
foreach (@insns) { eval; } # remaining instructions
@@ -692,6 +697,12 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[3],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
@@ -728,9 +739,16 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@X[3]);
eval(shift(@insns));
eval(shift(@insns));
@@ -739,6 +757,8 @@ sub Xloop_ssse3()
&movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@X[3]);
foreach (@insns) { eval; }
@@ -816,6 +836,64 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
'&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
+######
+sub bodyx_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K
+ return &bodyx_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2
+ '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2
+ '&lea ($e,&DWP(0,$e,@T[0]));',
+ '&rorx (@T[0],$a,5);',
+
+ '&andn (@T[1],$a,$c);',
+ '&and ($a,$b)',
+ '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer
+
+ '&xor (@T[1],$a)',
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub bodyx_20_39 () { # b^d^c
+ # on start $b=b^c^d
+ return &bodyx_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&add ($e,($j==19?@T[0]:$b))',
+ '&rorx ($b,@T[1],7);', # $b>>>2
+ '&rorx (@T[0],$a,5);',
+
+ '&xor ($a,$b) if ($j<79);',
+ '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer
+ '&xor ($a,$c) if ($j<79);',
+ '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
+
+sub bodyx_40_59 () { # ((b^c)&(c^d))^c
+ # on start $b=((b^c)&(c^d))^c
+ return &bodyx_20_39() if ($rx==59); $rx++;
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
+
+ '&rorx (@T[0],$a,5)',
+ '&lea ($e,&DWP(0,$e,$b))',
+ '&rorx ($b,@T[1],7)', # $b>>>2
+ '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer
+
+ '&mov (@T[1],$c)',
+ '&xor ($a,$b)', # b^c for next round
+ '&xor (@T[1],$b)', # c^d for next round
+
+ '&and ($a,@T[1])',
+ '&add ($e,@T[0])',
+ '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
&set_label("loop",16);
&Xupdate_ssse3_16_31(\&body_00_19);
@@ -855,9 +933,10 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
&mov (&DWP(12,@T[1]),$D);
&xor ($B,$D);
&mov (&DWP(16,@T[1]),$E);
- &and ($B,@T[0]);
- &movdqa (@X[0],@X[-3&7]);
- &xchg ($B,@T[0]);
+ &mov (@T[1],@T[0]);
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
+ &and (@T[0],$B);
+ &mov ($B,$T[1]);
&jmp (&label("loop"));
@@ -1226,9 +1305,10 @@ sub Xtail_avx()
&mov (&DWP(8,@T[1]),$C);
&xor ($B,$D);
&mov (&DWP(12,@T[1]),$D);
- &and ($B,@T[0]);
&mov (&DWP(16,@T[1]),$E);
- &xchg ($B,@T[0]);
+ &mov (@T[1],@T[0]);
+ &and (@T[0],$B);
+ &mov ($B,@T[1]);
&jmp (&label("loop"));
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index 71d5c12540..a22fea8424 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -62,16 +62,20 @@
# CPU clock cycles spent to process single byte (less is better).
#
# x86_64 SSSE3 AVX[2]
-# P4 9.8 -
-# Opteron 6.65 -
-# Core2 6.70 6.05/+11% -
-# Westmere 7.08 5.44/+30% -
-# Sandy Bridge 7.93 6.16/+28% 4.99/+59%
-# Ivy Bridge 6.30 4.63/+36% 4.60/+37%
-# Haswell 5.98 4.12/+45% 3.57/+67%
-# Bulldozer 10.9 5.95/+82%
-# VIA Nano 10.2 7.46/+37%
-# Atom 11.0 9.61/+14%
+# P4 9.05 -
+# Opteron 6.26 -
+# Core2 6.55 6.05/+8% -
+# Westmere 6.73 5.30/+27% -
+# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
+# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
+# Haswell 5.45 4.15/+31% 3.57/+53%
+# Bulldozer 9.11 5.95/+53%
+# VIA Nano 9.32 7.15/+30%
+# Atom [10.5?] [9.23?]/+14%
+# Silvermont 13.1(*) 9.37/+40%
+#
+# (*) obviously suboptimal result, nothing was done about it,
+# because SSSE3 code is compiled unconditionally;
$flavour = shift;
$output = shift;
@@ -114,7 +118,7 @@ $num="%r10";
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
-@xi=("%edx","%ebp");
+@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
@@ -129,42 +133,40 @@ my $j=$i+1;
$code.=<<___ if ($i==0);
mov `4*$i`($inp),$xi[0]
bswap $xi[0]
- mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
- mov $c,$t0
mov `4*$j`($inp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*$i`(%rsp)
mov $a,$t2
- xor $d,$t0
bswap $xi[1]
+ xor $c,$t0
rol \$5,$t2
- lea 0x5a827999($xi[0],$e),$e
and $b,$t0
- mov $xi[1],`4*$j`(%rsp)
+ lea 0x5a827999($xi[0],$e),$e
add $t2,$e
xor $d,$t0
rol \$30,$b
add $t0,$e
___
$code.=<<___ if ($i>=15);
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ xor $c,$t0
rol \$5,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
lea 0x5a827999($xi[0],$e),$e
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ rol \$30,$b
xor $d,$t0
- rol \$1,$xi[1]
add $t2,$e
- rol \$30,$b
- mov $xi[1],`4*($j%16)`(%rsp)
+ rol \$1,$xi[1]
add $t0,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
sub BODY_20_39 {
@@ -172,62 +174,58 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $b,$t0
+ `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $b,$t0
+ xor $d,$t0
rol \$5,$t2
- lea $K($xi[0],$e),$e
xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ lea $K($xi[0],$e),$e
+ xor $c,$t0
add $t2,$e
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
add $t0,$e
rol \$1,$xi[1]
___
-$code.=<<___ if ($i<76);
- mov $xi[1],`4*($j%16)`(%rsp)
-___
$code.=<<___ if ($i==79);
- mov $c,$t0
+ mov $b,$t0
mov $a,$t2
- xor $b,$t0
+ xor $d,$t0
lea $K($xi[0],$e),$e
rol \$5,$t2
- xor $d,$t0
+ xor $c,$t0
add $t2,$e
rol \$30,$b
add $t0,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
- mov $c,$t1
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
+ mov $d,$t1
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- and $d,$t0
+ and $c,$t0
mov $a,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t1
lea 0x8f1bbcdc($xi[0],$e),$e
+ xor $c,$t1
rol \$5,$t2
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
add $t0,$e
- and $b,$t1
rol \$1,$xi[1]
- add $t1,$e
- rol \$30,$b
- mov $xi[1],`4*($j%16)`(%rsp)
+ and $b,$t1
add $t2,$e
+ rol \$30,$b
+ add $t1,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
$code.=<<___;
@@ -261,17 +259,18 @@ $code.=<<___;
.align 16
.Lialu:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
- mov %rsp,%r11
+ push %r14
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
- mov %r11,`16*4`(%rsp)
+ mov %rax,`16*4`(%rsp)
.Lprologue:
mov 0($ctx),$A
@@ -305,11 +304,12 @@ $code.=<<___;
jnz .Lloop
mov `16*4`(%rsp),%rsi
- mov (%rsi),%r13
- mov 8(%rsi),%r12
- mov 16(%rsi),%rbp
- mov 24(%rsi),%rbx
- lea 32(%rsi),%rsp
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue:
ret
.size sha1_block_data_order,.-sha1_block_data_order
@@ -389,11 +389,11 @@ $code.=<<___;
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @X[2],@X[-4&7] # byte swap
- add \$64,$inp
pshufb @X[2],@X[-3&7]
pshufb @X[2],@X[-2&7]
- pshufb @X[2],@X[-1&7]
+ add \$64,$inp
paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @X[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@@ -418,61 +418,61 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[0],@X[-3&7]);
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
+
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
-
+ eval(shift(@insns)); # ror
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
- &movdqa (@Tx[0],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
- &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -480,12 +480,13 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
- eval(shift(@insns));
&movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@@ -499,24 +500,27 @@ sub Xupdate_ssse3_32_79()
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
- eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if ($Xi==8);
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
- eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ eval(shift(@insns));
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
&movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
}
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@@ -524,29 +528,31 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &psrld (@Tx[0],30);
eval(shift(@insns));
- eval(shift(@insns)); # rol
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -567,10 +573,11 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@@ -602,10 +609,12 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
@@ -614,6 +623,8 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }
@@ -1680,16 +1691,17 @@ se_handler:
jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
- lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
+ mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler