summaryrefslogtreecommitdiffstats
path: root/crypto/aes
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2012-11-12 18:11:17 +0000
committerAndy Polyakov <appro@openssl.org>2012-11-12 18:11:17 +0000
commitd90bf2ab2179f363d1180fe0a5a26422cc5c6794 (patch)
tree1c51c3dc856dd188b205dbfb7d6a43c98d3e3487 /crypto/aes
parent02620cfcd5a6ce11022f73f56698e23ac1c5aee6 (diff)
[vp]aes-x86[_64].pl: update from HEAD.
Diffstat (limited to 'crypto/aes')
-rwxr-xr-xcrypto/aes/asm/aes-586.pl283
-rwxr-xr-xcrypto/aes/asm/aes-x86_64.pl250
-rw-r--r--crypto/aes/asm/vpaes-x86.pl91
-rw-r--r--crypto/aes/asm/vpaes-x86_64.pl85
4 files changed, 354 insertions, 355 deletions
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 687ed811be..451d0e0ed1 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -39,7 +39,7 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
@@ -103,11 +103,12 @@
# byte for 128-bit key.
#
# ECB encrypt ECB decrypt CBC large chunk
-# P4 56[60] 84[100] 23
-# AMD K8 48[44] 70[79] 18
-# PIII 41[50] 61[91] 24
-# Core 2 32[38] 45[70] 18.5
-# Pentium 120 160 77
+# P4 52[54] 83[95] 23
+# AMD K8 46[41] 66[70] 18
+# PIII 41[50] 60[77] 24
+# Core 2 31[36] 45[64] 18.5
+# Atom 76[100] 96[138] 60
+# Pentium 115 150 77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
#
@@ -242,7 +243,7 @@ $vertical_spin=0; # shift "verticaly" defaults to 0, because of
sub encvert()
{ my ($te,@s) = @_;
- my $v0 = $acc, $v1 = $key;
+ my ($v0,$v1) = ($acc,$key);
&mov ($v0,$s[3]); # copy s3
&mov (&DWP(4,"esp"),$s[2]); # save s2
@@ -299,7 +300,7 @@ sub encvert()
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack. Strangely enough runs slower...
sub enchoriz()
-{ my $v0 = $key, $v1 = $acc;
+{ my ($v0,$v1) = ($key,$acc);
&movz ($v0,&LB($s0)); # 3, 2, 1, 0*
&rotr ($s2,8); # 8,11,10, 9
@@ -427,7 +428,7 @@ sub sse_encbody()
######################################################################
sub enccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
while ($#_>5) { pop(@_); $Fn=sub{}; }
my ($i,$te,@s)=@_;
my $tmp = $key;
@@ -476,24 +477,25 @@ sub enctransform()
my $tmp = $tbl;
my $r2 = $key ;
- &mov ($acc,$s[$i]);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$s[$i]);
&lea ($r2,&DWP(0,$s[$i],$s[$i]));
- &sub ($acc,$tmp);
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&and ($r2,0xfefefefe);
- &and ($acc,0x1b1b1b1b);
+ &sub ($acc,$tmp);
&mov ($tmp,$s[$i]);
+ &and ($acc,0x1b1b1b1b);
+ &rotr ($tmp,16);
&xor ($acc,$r2); # r2
+ &mov ($r2,$s[$i]);
&xor ($s[$i],$acc); # r0 ^ r2
+ &rotr ($r2,16+8);
+ &xor ($acc,$tmp);
&rotl ($s[$i],24);
- &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2
- &rotr ($tmp,16);
- &xor ($s[$i],$tmp);
- &rotr ($tmp,8);
- &xor ($s[$i],$tmp);
+ &xor ($acc,$r2);
+ &mov ($tmp,0x80808080) if ($i!=1);
+ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
}
&function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+ &mov ($tbl,0x80808080);
&enctransform(2);
&enctransform(3);
&enctransform(0);
@@ -607,82 +610,84 @@ sub sse_enccompact()
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
&movd ("eax","mm1"); # 5, 4, 1, 0
&movd ("ebx","mm5"); # 15,14,11,10
+ &mov ($__key,$key);
&movz ($acc,&LB("eax")); # 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
&movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
- &shl ("edx",8); # 1
&shr ("eax",16); # 5, 4
+ &shl ("edx",8); # 1
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&shl ($acc,16); # 10
- &or ("ecx",$acc); # 10
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &or ("ecx",$acc); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 5
&shl ($acc,24); # 11
- &or ("edx",$acc); # 11
&shr ("ebx",16); # 15,14
+ &or ("edx",$acc); # 11
- &movz ($acc,&HB("eax")); # 5
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
&shl ($acc,8); # 5
&or ("ecx",$acc); # 5
- &movz ($acc,&HB("ebx")); # 15
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&LB("eax")); # 4
&shl ($acc,24); # 15
&or ("ecx",$acc); # 15
- &movd ("mm0","ecx"); # t[0] collected
- &movz ($acc,&LB("eax")); # 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
- &shl ($acc,16); # 14
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 3
+ &shl ("ecx",16); # 14
+ &movd ("ebx","mm6"); # 13,12, 9, 8
&or ("ecx",$acc); # 14
- &movd ("ebx","mm6"); # 13,12, 9, 8
- &movz ($acc,&HB("eax")); # 3
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
+ &movz ($key,&HB("ebx")); # 9
&shl ($acc,24); # 3
&or ("ecx",$acc); # 3
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("ebx")); # 8
&shl ($acc,8); # 9
+ &shr ("ebx",16); # 13,12
&or ("ecx",$acc); # 9
- &movd ("mm1","ecx"); # t[1] collected
- &movz ($acc,&LB("ebx")); # 8
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
- &shr ("ebx",16); # 13,12
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
- &shl ($acc,16); # 2
- &or ("ecx",$acc); # 2
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
+ &movz ($key,&LB("eax")); # 2
&shr ("eax",16); # 7, 6
+ &movd ("mm1","ecx"); # t[1] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
+ &movz ($key,&HB("eax")); # 7
+ &shl ("ecx",16); # 2
+ &and ("eax",0xff); # 6
+ &or ("ecx",$acc); # 2
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&shl ($acc,24); # 7
- &or ("ecx",$acc); # 7
- &and ("eax",0xff); # 6
+ &and ("ebx",0xff); # 12
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
+ &or ("ecx",$acc); # 7
&shl ("eax",16); # 6
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
&or ("edx","eax"); # 6
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
&shl ($acc,8); # 13
- &or ("ecx",$acc); # 13
- &movd ("mm4","ecx"); # t[2] collected
- &and ("ebx",0xff); # 12
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
+ &or ("ecx",$acc); # 13
&or ("edx","ebx"); # 12
+ &mov ($key,$__key);
+ &movd ("mm4","ecx"); # t[2] collected
&movd ("mm5","edx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
@@ -1222,7 +1227,7 @@ sub enclast()
######################################################################
sub deccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
while ($#_>5) { pop(@_); $Fn=sub{}; }
my ($i,$td,@s)=@_;
my $tmp = $key;
@@ -1270,30 +1275,30 @@ sub dectransform()
my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
my $tp8 = $tbl;
- &mov ($acc,$s[$i]);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$s[$i]);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
- &xor ($acc,$tp2);
- &mov ($tp2,$acc);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &and ($tmp,$tp2);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$s[$i]); # tp2^tp1
- &xor ($acc,$tp4);
- &mov ($tp4,$acc);
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &and ($tmp,$tp4);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp8,&DWP(0,$tp4,$tp4));
&sub ($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()
&xor ($s[$i],$tp2);
&xor ($tp2,$tp8);
- &rotl ($tp2,24);
&xor ($s[$i],$tp4);
&xor ($tp4,$tp8);
- &rotl ($tp4,16);
+ &rotl ($tp2,24);
&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
- &rotl ($tp8,8);
+ &rotl ($tp4,16);
&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &rotl ($tp8,8);
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&mov ($s[0],$__s0) if($i==2); #prefetch $s0
&mov ($s[1],$__s1) if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
sub sse_deccompact()
{
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
+ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
&movd ("eax","mm1"); # 7, 6, 1, 0
+ &movd ("ebx","mm5"); # 13,12,11,10
+ &mov ($__key,$key);
- &pshufw ("mm5","mm4",0x09); # 13,12,11,10
&movz ($acc,&LB("eax")); # 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &movd ("ebx","mm5"); # 13,12,11,10
&movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
+ &shr ("eax",16); # 7, 6
&shl ("edx",8); # 1
- &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&shl ($acc,16); # 10
+ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
&or ("ecx",$acc); # 10
- &shr ("eax",16); # 7, 6
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 7
&shl ($acc,24); # 11
- &or ("edx",$acc); # 11
&shr ("ebx",16); # 13,12
+ &or ("edx",$acc); # 11
- &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&shl ($acc,24); # 7
&or ("ecx",$acc); # 7
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
+ &movz ($key,&LB("eax")); # 6
&shl ($acc,8); # 13
+ &movd ("eax","mm2"); # 3, 2, 5, 4
&or ("ecx",$acc); # 13
- &movd ("mm0","ecx"); # t[0] collected
- &movz ($acc,&LB("eax")); # 6
- &movd ("eax","mm2"); # 3, 2, 5, 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
- &shl ("ecx",16); # 6
- &movz ($acc,&LB("ebx")); # 12
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
+ &movz ($key,&LB("ebx")); # 12
+ &shl ($acc,16); # 6
&movd ("ebx","mm6"); # 9, 8,15,14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
+ &movz ($key,&LB("eax")); # 4
&or ("ecx",$acc); # 12
- &movz ($acc,&LB("eax")); # 4
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&or ("edx",$acc); # 4
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 5
&shl ($acc,16); # 14
+ &shr ("eax",16); # 3, 2
&or ("edx",$acc); # 14
- &movd ("mm1","edx"); # t[1] collected
- &movz ($acc,&HB("eax")); # 5
- &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
- &shl ("edx",8); # 5
- &movz ($acc,&HB("ebx")); # 15
- &shr ("eax",16); # 3, 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
- &shl ($acc,24); # 15
- &or ("edx",$acc); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
&shr ("ebx",16); # 9, 8
+ &shl ($acc,8); # 5
+ &movd ("mm1","edx"); # t[1] collected
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&HB("ebx")); # 9
+ &shl ("edx",24); # 15
+ &and ("ebx",0xff); # 8
+ &or ("edx",$acc); # 15
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("eax")); # 2
&shl ($acc,8); # 9
- &or ("ecx",$acc); # 9
- &and ("ebx",0xff); # 8
+ &movz ("eax",&HB("eax")); # 3
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
+ &or ("ecx",$acc); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
&or ("edx","ebx"); # 8
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
&shl ($acc,16); # 2
- &or ("edx",$acc); # 2
- &movd ("mm4","edx"); # t[2] collected
- &movz ("eax",&HB("eax")); # 3
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
+ &or ("edx",$acc); # 2
&shl ("eax",24); # 3
&or ("ecx","eax"); # 3
+ &mov ($key,$__key);
+ &movd ("mm4","edx"); # t[2] collected
&movd ("mm5","ecx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
@@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&mov ("ecx",240/4);
&xor ("eax","eax");
&align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_ezero")
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_ezero");
&mov ("esp",$_esp);
&popf ();
&set_label("drop_out");
@@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&mov ("ecx",240/4);
&xor ("eax","eax");
&align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_dzero")
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_dzero");
&mov ("esp",$_esp);
&popf ();
&function_end_A();
@@ -2865,32 +2872,32 @@ sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
my $tmp = $tbl;
- &mov ($acc,$tp1);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$tp1);
&lea ($tp2,&DWP(0,$tp1,$tp1));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
- &xor ($acc,$tp2);
- &mov ($tp2,$acc);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$tp2);
&lea ($tp4,&DWP(0,$tp2,$tp2));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$tp1); # tp2^tp1
- &xor ($acc,$tp4);
- &mov ($tp4,$acc);
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$tp4);
&lea ($tp8,&DWP(0,$tp4,$tp4));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&xor ($tp4,$tp1); # tp4^tp1
&sub ($acc,$tmp);
&and ($tp8,0xfefefefe);
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index 91459287dd..98066a594f 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -19,9 +19,10 @@
# Performance in number of cycles per processed byte for 128-bit key:
#
# ECB encrypt ECB decrypt CBC large chunk
-# AMD64 33 41 13.0
-# EM64T 38 59 18.6(*)
-# Core 2 30 43 14.5(*)
+# AMD64 33 43 13.0
+# EM64T 38 56 18.6(*)
+# Core 2 30 42 14.5(*)
+# Atom 65 86 32.1(*)
#
# (*) with hyper-threading off
@@ -365,68 +366,66 @@ $code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
- movzb ($sbox,$t0,1),$t0
- movzb ($sbox,$t1,1),$t1
- movzb ($sbox,$t2,1),$t2
-
movzb `&lo("$s3")`,$t3
movzb `&hi("$s1")`,$acc0
movzb `&hi("$s2")`,$acc1
+ shr \$16,$s2
+ movzb `&hi("$s3")`,$acc2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
- movzb ($sbox,$acc0,1),$t4 #$t0
- movzb ($sbox,$acc1,1),$t5 #$t1
- movzb `&hi("$s3")`,$acc2
+ movzb ($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s0")`,$acc0
- shr \$16,$s2
+ movzb ($sbox,$acc1,1),$t5 #$t1
+ movzb `&lo("$s2")`,$acc1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
- shr \$16,$s3
- movzb `&lo("$s2")`,$acc1
shl \$8,$t4
+ shr \$16,$s3
shl \$8,$t5
- movzb ($sbox,$acc1,1),$acc1 #$t0
xor $t4,$t0
- xor $t5,$t1
-
- movzb `&lo("$s3")`,$t4
shr \$16,$s0
+ movzb `&lo("$s3")`,$t4
shr \$16,$s1
- movzb `&lo("$s0")`,$t5
+ xor $t5,$t1
shl \$8,$acc2
- shl \$8,$acc0
- movzb ($sbox,$t4,1),$t4 #$t1
- movzb ($sbox,$t5,1),$t5 #$t2
+ movzb `&lo("$s0")`,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
- xor $acc0,$t3
+ shl \$8,$acc0
movzb `&lo("$s1")`,$acc2
- movzb `&hi("$s3")`,$acc0
shl \$16,$acc1
- movzb ($sbox,$acc2,1),$acc2 #$t3
- movzb ($sbox,$acc0,1),$acc0 #$t0
+ xor $acc0,$t3
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb `&hi("$s3")`,$acc0
+ movzb ($sbox,$t5,1),$t5 #$t2
xor $acc1,$t0
- movzb `&hi("$s0")`,$acc1
shr \$8,$s2
+ movzb `&hi("$s0")`,$acc1
+ shl \$16,$t4
shr \$8,$s1
+ shl \$16,$t5
+ xor $t4,$t1
+ movzb ($sbox,$acc2,1),$acc2 #$t3
+ movzb ($sbox,$acc0,1),$acc0 #$t0
movzb ($sbox,$acc1,1),$acc1 #$t1
movzb ($sbox,$s2,1),$s3 #$t3
movzb ($sbox,$s1,1),$s2 #$t2
- shl \$16,$t4
- shl \$16,$t5
+
shl \$16,$acc2
- xor $t4,$t1
xor $t5,$t2
- xor $acc2,$t3
-
shl \$24,$acc0
+ xor $acc2,$t3
shl \$24,$acc1
- shl \$24,$s3
xor $acc0,$t0
- shl \$24,$s2
+ shl \$24,$s3
xor $acc1,$t1
+ shl \$24,$s2
mov $t0,$s0
mov $t1,$s1
xor $t2,$s2
@@ -465,12 +464,12 @@ sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
$code.=<<___;
- mov $s0,$acc0
- mov $s1,$acc1
- and \$0x80808080,$acc0
- and \$0x80808080,$acc1
- mov $acc0,$t0
- mov $acc1,$t1
+ mov \$0x80808080,$t0
+ mov \$0x80808080,$t1
+ and $s0,$t0
+ and $s1,$t1
+ mov $t0,$acc0
+ mov $t1,$acc1
shr \$7,$t0
lea ($s0,$s0),$r20
shr \$7,$t1
@@ -488,25 +487,25 @@ $code.=<<___;
xor $r20,$s0
xor $r21,$s1
- mov $s2,$acc0
- mov $s3,$acc1
+ mov \$0x80808080,$t2
rol \$24,$s0
+ mov \$0x80808080,$t3
rol \$24,$s1
- and \$0x80808080,$acc0
- and \$0x80808080,$acc1
+ and $s2,$t2
+ and $s3,$t3
xor $r20,$s0
xor $r21,$s1
- mov $acc0,$t2
- mov $acc1,$t3
+ mov $t2,$acc0
ror \$16,$t0
+ mov $t3,$acc1
ror \$16,$t1
- shr \$7,$t2
lea ($s2,$s2),$r20
+ shr \$7,$t2
xor $t0,$s0
- xor $t1,$s1
shr \$7,$t3
- lea ($s3,$s3),$r21
+ xor $t1,$s1
ror \$8,$t0
+ lea ($s3,$s3),$r21
ror \$8,$t1
sub $t2,$acc0
sub $t3,$acc1
@@ -522,23 +521,23 @@ $code.=<<___;
xor $acc0,$r20
xor $acc1,$r21
+ ror \$16,$t2
xor $r20,$s2
+ ror \$16,$t3
xor $r21,$s3
rol \$24,$s2
+ mov 0($sbox),$acc0 # prefetch Te4
rol \$24,$s3
xor $r20,$s2
- xor $r21,$s3
- mov 0($sbox),$acc0 # prefetch Te4
- ror \$16,$t2
- ror \$16,$t3
mov 64($sbox),$acc1
- xor $t2,$s2
- xor $t3,$s3
+ xor $r21,$s3
mov 128($sbox),$r20
+ xor $t2,$s2
ror \$8,$t2
+ xor $t3,$s3
ror \$8,$t3
- mov 192($sbox),$r21
xor $t2,$s2
+ mov 192($sbox),$r21
xor $t3,$s3
___
}
@@ -935,70 +934,69 @@ $code.=<<___;
movzb `&lo("$s0")`,$t0
movzb `&lo("$s1")`,$t1
movzb `&lo("$s2")`,$t2
- movzb ($sbox,$t0,1),$t0
- movzb ($sbox,$t1,1),$t1
- movzb ($sbox,$t2,1),$t2
-
movzb `&lo("$s3")`,$t3
movzb `&hi("$s3")`,$acc0
movzb `&hi("$s0")`,$acc1
+ shr \$16,$s3
+ movzb `&hi("$s1")`,$acc2
+ movzb ($sbox,$t0,1),$t0
+ movzb ($sbox,$t1,1),$t1
+ movzb ($sbox,$t2,1),$t2
movzb ($sbox,$t3,1),$t3
- movzb ($sbox,$acc0,1),$t4 #$t0
- movzb ($sbox,$acc1,1),$t5 #$t1
- movzb `&hi("$s1")`,$acc2
+ movzb ($sbox,$acc0,1),$t4 #$t0
movzb `&hi("$s2")`,$acc0
- shr \$16,$s2
+ movzb ($sbox,$acc1,1),$t5 #$t1
movzb ($sbox,$acc2,1),$acc2 #$t2
movzb ($sbox,$acc0,1),$acc0 #$t3
- shr \$16,$s3
- movzb `&lo("$s2")`,$acc1
- shl \$8,$t4
+ shr \$16,$s2
shl \$8,$t5
- movzb ($sbox,$acc1,1),$acc1 #$t0
- xor $t4,$t0
- xor $t5,$t1
-
- movzb `&lo("$s3")`,$t4
+ shl \$8,$t4
+ movzb `&lo("$s2")`,$acc1
shr \$16,$s0
+ xor $t4,$t0
shr \$16,$s1
- movzb `&lo("$s0")`,$t5
+ movzb `&lo("$s3")`,$t4
+
shl \$8,$acc2
+ xor $t5,$t1
shl \$8,$acc0
- movzb ($sbox,$t4,1),$t4 #$t1
- movzb ($sbox,$t5,1),$t5 #$t2
+ movzb `&lo("$s0")`,$t5
+ movzb ($sbox,$acc1,1),$acc1 #$t0
xor $acc2,$t2
- xor $acc0,$t3
-
movzb `&lo("$s1")`,$acc2
- movzb `&hi("$s1")`,$acc0
+
shl \$16,$acc1
+ xor $acc0,$t3
+ movzb ($sbox,$t4,1),$t4 #$t1
+ movzb `&hi("$s1")`,$acc0
movzb ($sbox,$acc2,1),$acc2 #$t3
- movzb ($sbox,$acc0,1),$acc0 #$t0
xor $acc1,$t0
-
+ movzb ($sbox,$t5,1),$t5 #$t2
movzb `&hi("$s2")`,$acc1
+
+ shl \$16,$acc2
shl \$16,$t4
shl \$16,$t5
- movzb ($sbox,$acc1,1),$s1 #$t1
+ xor $acc2,$t3
+ movzb `&hi("$s3")`,$acc2
xor $t4,$t1
+ shr \$8,$s0
xor $t5,$t2
- movzb `&hi("$s3")`,$acc1
- shr \$8,$s0
- shl \$16,$acc2
- movzb ($sbox,$acc1,1),$s2 #$t2
+ movzb ($sbox,$acc0,1),$acc0 #$t0
+ movzb ($sbox,$acc1,1),$s1 #$t1
+ movzb ($sbox,$acc2,1),$s2 #$t2
movzb ($sbox,$s0,1),$s3 #$t3
- xor $acc2,$t3
+ mov $t0,$s0
shl \$24,$acc0
shl \$24,$s1
shl \$24,$s2
- xor $acc0,$t0
+ xor $acc0,$s0
shl \$24,$s3
xor $t1,$s1
- mov $t0,$s0
xor $t2,$s2
xor $t3,$s3
___
@@ -1013,12 +1011,12 @@ sub dectransform()
my $prefetch = shift;
$code.=<<___;
- mov $tp10,$acc0
- mov $tp18,$acc8
- and $mask80,$acc0
- and $mask80,$acc8
- mov $acc0,$tp40
- mov $acc8,$tp48
+ mov $mask80,$tp40
+ mov $mask80,$tp48
+ and $tp10,$tp40
+ and $tp18,$tp48
+ mov $tp40,$acc0
+ mov $tp48,$acc8
shr \$7,$tp40
lea ($tp10,$tp10),$tp20
shr \$7,$tp48
@@ -1029,15 +1027,15 @@ $code.=<<___;
and $maskfe,$tp28
and $mask1b,$acc0
and $mask1b,$acc8
- xor $tp20,$acc0
- xor $tp28,$acc8
- mov $acc0,$tp20
- mov $acc8,$tp28
-
- and $mask80,$acc0
- and $mask80,$acc8
- mov $acc0,$tp80
- mov $acc8,$tp88
+ xor $acc0,$tp20
+ xor $acc8,$tp28
+ mov $mask80,$tp80
+ mov $mask80,$tp88
+
+ and $tp20,$tp80
+ and $tp28,$tp88
+ mov $tp80,$acc0
+ mov $tp88,$acc8
shr \$7,$tp80
lea ($tp20,$tp20),$tp40
shr \$7,$tp88
@@ -1048,15 +1046,15 @@ $code.=<<___;
and $maskfe,$tp48
and $mask1b,$acc0
and $mask1b,$acc8
- xor $tp40,$acc0
- xor $tp48,$acc8
- mov $acc0,$tp40
- mov $acc8,$tp48
-
- and $mask80,$acc0
- and $mask80,$acc8
- mov $acc0,$tp80
- mov $acc8,$tp88
+ xor $acc0,$tp40
+ xor $acc8,$tp48
+ mov $mask80,$tp80
+ mov $mask80,$tp88
+
+ and $tp40,$tp80
+ and $tp48,$tp88
+ mov $tp80,$acc0
+ mov $tp88,$acc8
shr \$7,$tp80
xor $tp10,$tp20 # tp2^=tp1
shr \$7,$tp88
@@ -1081,51 +1079,51 @@ $code.=<<___;
mov $tp10,$acc0
mov $tp18,$acc8
xor $tp80,$tp40 # tp4^tp1^=tp8
- xor $tp88,$tp48 # tp4^tp1^=tp8
shr \$32,$acc0
+ xor $tp88,$tp48 # tp4^tp1^=tp8
shr \$32,$acc8
xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
- xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
+ xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
+ rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
- rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
xor `&LO("$tp80")`,`&LO("$tp10")`
- xor `&LO("$tp88")`,`&LO("$tp18")`
shr \$32,$tp80
+ xor `&LO("$tp88")`,`&LO("$tp18")`
shr \$32,$tp88
xor `&LO("$tp80")`,`&LO("$acc0")`
xor `&LO("$tp88")`,`&LO("$acc8")`
mov $tp20,$tp80
- mov $tp28,$tp88
- shr \$32,$tp80
- shr \$32,$tp88
rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
+ mov $tp28,$tp88
rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
- rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
- rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
+ shr \$32,$tp80
xor `&LO("$tp20")`,`&LO("$tp10")`
+ shr \$32,$tp88
xor `&LO("$tp28")`,`&LO("$tp18")`
+ rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
mov $tp40,$tp20
+ rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
mov $tp48,$tp28
+ shr \$32,$tp20
xor `&LO("$tp80")`,`&LO("$acc0")`
+ shr \$32,$tp28
xor `&LO("$tp88")`,`&LO("$acc8")`
`"mov 0($sbox),$mask80" if ($prefetch)`
- shr \$32,$tp20
- shr \$32,$tp28
- `"mov 64($sbox),$maskfe" if ($prefetch)`
rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
+ `"mov 64($sbox),$maskfe" if ($prefetch)`
rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
`"mov 128($sbox),$mask1b" if ($prefetch)`
rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
- rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
`"mov 192($sbox),$tp80" if ($prefetch)`
xor `&LO("$tp40")`,`&LO("$tp10")`
+ rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
xor `&LO("$tp48")`,`&LO("$tp18")`
`"mov 256($sbox),$tp88" if ($prefetch)`
xor `&LO("$tp20")`,`&LO("$acc0")`
@@ -1301,10 +1299,6 @@ private_AES_set_encrypt_key:
call _x86_64_AES_set_encrypt_key
- mov 8(%rsp),%r15
- mov 16(%rsp),%r14
- mov 24(%rsp),%r13
- mov 32(%rsp),%r12
mov 40(%rsp),%rbp
mov 48(%rsp),%rbx
add \$56,%rsp
diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl
index 1533e2c304..433912ff57 100644
--- a/crypto/aes/asm/vpaes-x86.pl
+++ b/crypto/aes/asm/vpaes-x86.pl
@@ -27,9 +27,9 @@
#
# aes-586.pl vpaes-x86.pl
#
-# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
-# Nehalem 27.9/40.4/18.1 10.3/12.0
-# Atom 102./119./60.1 64.5/85.3(***)
+# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
+# Nehalem 27.9/40.4/18.1 10.2/11.9
+# Atom 70.7/92.1/60.1 61.1/81.0(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
@@ -40,8 +40,8 @@
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
-# pshufb, yet it's respectable +32%/65% improvement on Core 2
-# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+# pshufb, yet it's respectable +28%/64% improvement on Core 2
+# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
@@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
- &movdqu ("xmm5",&QWP(0,$key));
- &psrld ("xmm1",4);
&pand ("xmm0","xmm6");
+ &movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
- &pshufb ("xmm0","xmm1");
&pxor ("xmm2","xmm5");
- &pxor ("xmm0","xmm2");
+ &psrld ("xmm1",4);
&add ($key,16);
+ &pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
+ &pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
- &pshufb ("xmm4","xmm2"); # 4 = sb1u
- &pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
- &pxor ("xmm0","xmm4"); # 0 = A
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
- &pshufb ("xmm5","xmm2"); # 4 = sb2u
+ &pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+ &pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
- &pshufb ("xmm2","xmm3"); # 2 = sb2t
- &pxor ("xmm2","xmm5"); # 2 = 2A
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
+ &pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
@@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
- &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&sub ($round,1); # nr--
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
- &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm5","xmm0"); # 2 = a/k
- &pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm7"); # 3 : 1/i
+ &pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
- &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm7"); # 4 : 1/j
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
- &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
- &pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
- &movdqu ("xmm5",&QWP(0,$key));
+ &pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
+ &movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
@@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
- &mov ($round,&DWP(240,$key));
&lea ($base,&DWP($k_dsbd,$const));
+ &mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
@@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
+ &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
+ &pshufb ("