summaryrefslogtreecommitdiffstats
path: root/crypto/aes/asm/aes-586.pl
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2006-08-07 09:05:52 +0000
committerAndy Polyakov <appro@openssl.org>2006-08-07 09:05:52 +0000
commit6c69aa532eee8a2cd1228b7810a926f2b752b864 (patch)
treee1a5ba678ca982768e4639d4647b16e9f9a88e96 /crypto/aes/asm/aes-586.pl
parent6264c9b2a9e19119f59112f4e47a79c9dc062148 (diff)
Revised AES_cbc_encrypt in x86 assembler module.
Diffstat (limited to 'crypto/aes/asm/aes-586.pl')
-rwxr-xr-xcrypto/aes/asm/aes-586.pl919
1 files changed, 593 insertions, 326 deletions
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index ea66589a06..4401cee9e3 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -6,7 +6,7 @@
# forms are granted according to the OpenSSL license.
# ====================================================================
#
-# Version 4.2.
+# Version 4.3.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@@ -114,11 +114,79 @@
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
# part is that it has no negative effect on performance:-)
+#
+# Version 4.3 implements switch between compact and non-compact block
+# functions in AES_cbc_encrypt depending on how much data was asked
+# to process in one stroke.
+#
+# Timing attacks are classified in two classes: synchronous when
+# attacker consciously initiates cryptographic operation and collect
+# timing data of various character afterwards, and asynchronous when
+# malicious code is executed on same CPU simultaneously with AES,
+# instruments itself and performs statistical analysis of this data.
+#
+# As far as synchronous attacks go the root to the AES timing
+# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
+# are referred to in single 128-bit block operation. Well, in C
+# implementation with 4 distinct tables it's actually as little as 40
+# references per 256 elements table, but anyway... Secondly, even
+# though S-box elements are clustered into smaller amount of cache-
+# lines, smaller than 160 and even 40, it turned out that for certain
+# plain-text pattern[s] or simply put chosen plain-text and given key
+# few cache-lines remain unaccessed during block operation. Now, if
+# attacker can figure out this access pattern, he can deduct the key
+# [or at least part of it]. The natural way to mitigate this kind of
+# attacks is to minimize the amount of cache-lines in S-box and/or
+# prefetch them to ensure that every one is accessed for more uniform
+# timing. But note that *if* plain-text was concealed in such way that
+# input to block function is distributed *uniformly*, then attack
+# wouldn't apply. Now note that some encryption modes, most notably
+# CBC, do masks the plain-text in this exact way [secure cipher output
+# is distributed uniformly]. Yes, one still might find input that
+# would reveal the information about given key, but if amount of
+# candidate inputs to be tried is larger than amount possible key
+# combinations then attack becomes infeasible. This is why revised
+# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
+# of data is to be processed in one stroke. The current size limit of
+# 512 bytes is chosen to provide same [diminishigly low] probability
+# for cache-line to remain untouched in large chunk operation with
+# large S-box as for single block operation with compact S-box and
+# surely needs more careful consideration...
+#
+# As for asynchronous attacks. There are two flavours: attacker code
+# being interleaved with AES on hyper-threading CPU at *instruction*
+# level, and two processes time sharing single core. As for latter.
+# Two vectors. 1. Given that attacker process has higher priority,
+# yield execution to process performing AES just before timer fires
+# off the scheduler, immediately regain control of CPU and analyze the
+# cache state. For this attack to be efficient attacker would have to
+# effectively slow down the operation by several *orders* of magnitute,
+# by ratio of time slice to duration of handful of AES rounds, which
+# unlikely to remain unnoticed. Not to mention that this also means
+# that he would spend correspondigly more time to collect enough
+# statistical data to mount the attack. It's probably appropriate to
+# say that if adeversary reckons that this attack is beneficial and
+# risks to be noticed, you probably have larger problems having him
+# mere opportunity. In other words suggested code design expects you
+# to preclude/mitigate this attack by overall system security design.
+# 2. Attacker manages to make his code interrupt driven. In order for
+# this kind of attack to be feasible, interrupt rate has to be high
+# enough, again comparable to duration of handful of AES rounds. But
+# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
+# generates interrupts at such raging rate...
+#
+# And now back to the former, hyper-threading CPU or more specifically
+# Intel P4. Recall that asynchronous attack implies that malicious
+# code instruments itself. And naturally instrumentation granularity
+# has be noticeably lower than duration of codepath accessing S-box.
+# Given that all cache-lines are accessed during that time that is.
+# Current implementation accesses *all* cache-lines within ~50 cycles
+# window, which is actually *less* than RDTSC latency on Intel P4!
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386");
+&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
$s0="eax";
$s1="ebx";
@@ -128,14 +196,26 @@ $key="edi";
$acc="esi";
$tbl="ebp";
+# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
+# by caller
+$__ra=&DWP(0,"esp"); # return address
+$__s0=&DWP(4,"esp"); # s0 backing store
+$__s1=&DWP(8,"esp"); # s1 backing store
+$__s2=&DWP(12,"esp"); # s2 backing store
+$__s3=&DWP(16,"esp"); # s3 backing store
+$__key=&DWP(20,"esp"); # pointer to key schedule
+$__end=&DWP(24,"esp"); # pointer to end of key schedule
+$__tbl=&DWP(28,"esp"); # %ebp backing store
+
+# stack frame layout in AES_[en|crypt] routines, which differs from
+# above by 4 and overlaps by %ebp backing store
+$_tbl=&DWP(24,"esp");
+$_esp=&DWP(28,"esp");
+
sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
-$compromise=0; # $compromise=128 abstains from copying key
- # schedule to stack when encrypting inputs
- # shorter than 128 bytes at the cost of
- # risksing aliasing with S-boxes. In return
- # you get way better, up to +70%, small block
- # performance.
+$speed_limit=512; # chunks smaller than $speed_limit are
+ # processed with compact routine in CBC mode
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# recent µ-archs], but ~5 times smaller!
# I favor compact code to minimize cache
@@ -206,7 +286,7 @@ sub encvert()
&movz ($v0,&HB($v1));
&and ($v1,0xFF);
&xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
- &mov ($key,&DWP(20,"esp")); # reincarnate v1 as key
+ &mov ($key,$__key); # reincarnate v1 as key
&xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
}
@@ -226,7 +306,7 @@ sub enchoriz()
&xor ($v1,&DWP(2,$te,$v0,8)); # 10
&movz ($v0,&HB($s3)); # 13,12,15*,14
&xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
- &mov (&DWP(4,"esp"),$v1); # t[0] saved
+ &mov ($__s0,$v1); # t[0] saved
&movz ($v0,&LB($s1)); # 7, 6, 5, 4*
&shr ($s1,16); # -, -, 7, 6
@@ -256,11 +336,11 @@ sub enchoriz()
&shr ($s0,24); # 1*
&mov ($s2,&DWP(1,$te,$v0,8)); # 11
&xor ($s2,&DWP(3,$te,$s0,8)); # 1
- &mov ($s0,&DWP(4,"esp")); # s[0]=t[0]
+ &mov ($s0,$__s0); # s[0]=t[0]
&movz ($v0,&LB($s3)); # 13,12, 7, 6*
&shr ($s3,16); # , ,13,12
&xor ($s2,&DWP(2,$te,$v0,8)); # 6
- &mov ($key,&DWP(20,"esp")); # reincarnate v0 as key
+ &mov ($key,$__key); # reincarnate v0 as key
&and ($s3,0xff); # , ,13,12*
&mov ($s3,&DWP(0,$te,$s3,8)); # 12
&xor ($s3,$s2); # s[2]=t[3] collected
@@ -350,7 +430,7 @@ sub enccompact()
# $Fn is used in first compact round and its purpose is to
# void restoration of some values from stack, so that after
# 4xenccompact with extra argument $key value is left there...
- if ($i==3) { &$Fn ($key,&DWP(20,"esp")); }##%edx
+ if ($i==3) { &$Fn ($key,$__key); }##%edx
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -363,7 +443,7 @@ sub enccompact()
&shl ($tmp,8);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
else { &mov ($tmp,$s[2]);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
@@ -372,7 +452,7 @@ sub enccompact()
&shl ($tmp,16);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24); }
@@ -414,7 +494,7 @@ sub enctransform()
&public_label("AES_Te");
&function_begin_B("_x86_AES_encrypt_compact");
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
@@ -424,7 +504,7 @@ sub enctransform()
&mov ($acc,&DWP(240,$key)); # load key->rounds
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
# prefetch Te4
&mov ($key,&DWP(0-128,$tbl));
@@ -446,16 +526,16 @@ sub enctransform()
&enctransform(3);
&enctransform(0);
&enctransform(1);
- &mov ($key,&DWP(20,"esp"));
- &mov ($tbl,&DWP(28,"esp"));
+ &mov ($key,$__key);
+ &mov ($tbl,$__tbl);
&add ($key,16); # advance rd_key
&xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
&enccompact(0,$tbl,$s0,$s1,$s2,$s3);
@@ -604,6 +684,7 @@ sub sse_enccompact()
&punpckldq ("mm4","mm5"); # t[2,3] collected
}
+ if (!$x86only) {
&public_label("AES_Te");
&function_begin_B("_sse_AES_encrypt_compact");
&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
@@ -613,7 +694,7 @@ sub sse_enccompact()
&mov ($acc,&DWP(240,$key)); # load key->rounds
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&mov ($s0,0x1b1b1b1b); # magic constant
&mov (&DWP(8,"esp"),$s0);
@@ -632,7 +713,7 @@ sub sse_enccompact()
&set_label("loop",16);
&sse_enccompact();
&add ($key,16);
- &cmp ($key,&DWP(24,"esp"));
+ &cmp ($key,$__end);
&ja (&label("out"));
&movq ("mm2",&QWP(8,"esp"));
@@ -673,6 +754,7 @@ sub sse_enccompact()
&ret ();
&function_end_B("_sse_AES_encrypt_compact");
+ }
######################################################################
# Vanilla block function.
@@ -684,7 +766,7 @@ sub encstep()
my $out = $i==3?$s[0]:$acc;
# lines marked with #%e?x[i] denote "reordered" instructions...
- if ($i==3) { &mov ($key,&DWP(20,"esp")); }##%edx
+ if ($i==3) { &mov ($key,$__key); }##%edx
else { &mov ($out,$s[0]);
&and ($out,0xFF); }
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -695,14 +777,14 @@ sub encstep()
&movz ($tmp,&HB($s[1]));
&xor ($out,&DWP(3,$te,$tmp,8));
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
else { &mov ($tmp,$s[2]);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
&and ($tmp,0xFF);
&xor ($out,&DWP(2,$te,$tmp,8));
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
@@ -717,7 +799,7 @@ sub enclast()
my $tmp = $key;
my $out = $i==3?$s[0]:$acc;
- if ($i==3) { &mov ($key,&DWP(20,"esp")); }##%edx
+ if ($i==3) { &mov ($key,$__key); }##%edx
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -731,7 +813,7 @@ sub enclast()
&and ($tmp,0x0000ff00);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
else { &mov ($tmp,$s[2]);
&shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
@@ -740,7 +822,7 @@ sub enclast()
&and ($tmp,0x00ff0000);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
&shr ($tmp,24); }
@@ -760,7 +842,7 @@ sub enclast()
}
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
@@ -772,7 +854,7 @@ sub enclast()
if ($small_footprint) {
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&set_label("loop",16);
if ($vertical_spin) {
@@ -788,8 +870,8 @@ sub enclast()
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
}
else {
@@ -814,7 +896,7 @@ sub enclast()
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("12rounds",4);
for ($i=1;$i<3;$i++) {
if ($vertical_spin) {
@@ -831,7 +913,7 @@ sub enclast()
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("10rounds",4);
for ($i=1;$i<10;$i++) {
if ($vertical_spin) {
@@ -1089,40 +1171,42 @@ sub enclast()
&and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
&sub ("esp",$s1);
&add ("esp",4); # 4 is reserved for caller's return address
- &mov (&DWP(28,"esp"),$s0); # save stack pointer
+ &mov ($_esp,$s0); # save stack pointer
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tbl);
- &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+
# pick Te4 copy which can't "overlap" with stack frame or key schedule
&lea ($s1,&DWP(768-4,"esp"));
&sub ($s1,$tbl);
&and ($s1,0x300);
&lea ($tbl,&DWP(2048+128,$tbl,$s1));
- &bt (&DWP(0,$s0),25); # check for SSE bit
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),25); # check for SSE bit
&jnc (&label("x86"));
&movq ("mm0",&QWP(0,$acc));
&movq ("mm4",&QWP(8,$acc));
&call ("_sse_AES_encrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&movq (&QWP(0,$acc),"mm0"); # write output data
&movq (&QWP(8,$acc),"mm4");
&emms ();
&function_end_A();
-
+ }
&set_label("x86",16);
- &mov (&DWP(24,"esp"),$tbl);
+ &mov ($_tbl,$tbl);
&mov ($s0,&DWP(0,$acc)); # load input data
&mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc));
&mov ($s3,&DWP(12,$acc));
&call ("_x86_AES_encrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&mov (&DWP(0,$acc),$s0); # write output data
&mov (&DWP(4,$acc),$s1);
@@ -1147,7 +1231,7 @@ sub deccompact()
# void restoration of some values from stack, so that after
# 4xdeccompact with extra argument $key, $s0 and $s1 values
# are left there...
- if($i==3) { &$Fn ($key,&DWP(20,"esp")); }
+ if($i==3) { &$Fn ($key,$__key); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
&movz ($out,&BP(-128,$td,$out,1));
@@ -1166,14 +1250,14 @@ sub deccompact()
&shl ($tmp,16);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],&DWP(8,"esp")); }
+ if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
&movz ($tmp,&BP(-128,$td,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &$Fn ($s[3],&DWP(4,"esp")); }
+ if ($i==3) { &$Fn ($s[3],$__s0); }
}
# must be called with 2,3,0,1 as argument sequence!!!
@@ -1233,17 +1317,17 @@ sub dectransform()
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
- &mov ($s[0],&DWP(4,"esp")) if($i==2); #prefetch $s0
- &mov ($s[1],&DWP(8,"esp")) if($i==3); #prefetch $s1
- &mov ($s[2],&DWP(12,"esp")) if($i==1);
- &mov ($s[3],&DWP(16,"esp")) if($i==1);
+ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
+ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
+ &mov ($s[2],$__s2) if($i==1);
+ &mov ($s[3],$__s3) if($i==1);
&mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
}
&public_label("AES_Td");
&function_begin_B("_x86_AES_decrypt_compact");
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
@@ -1254,7 +1338,7 @@ sub dectransform()
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
# prefetch Td4
&mov ($key,&DWP(0-128,$tbl));
@@ -1276,16 +1360,16 @@ sub dectransform()
&dectransform(3);
&dectransform(0);
&dectransform(1);
- &mov ($key,&DWP(20,"esp"));
- &mov ($tbl,&DWP(28,"esp"));
+ &mov ($key,$__key);
+ &mov ($tbl,$__tbl);
&add ($key,16); # advance rd_key
&xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
&deccompact(0,$tbl,$s0,$s3,$s2,$s1);
@@ -1392,6 +1476,7 @@ sub sse_deccompact()
&punpckldq ("mm4","mm5"); # t[2,3] collected
}
+ if (!$x86only) {
&public_label("AES_Td");
&function_begin_B("_sse_AES_decrypt_compact");
&pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
@@ -1401,7 +1486,7 @@ sub sse_deccompact()
&mov ($acc,&DWP(240,$key)); # load key->rounds
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&mov ($s0,0x1b1b1b1b); # magic constant
&mov (&DWP(8,"esp"),$s0);
@@ -1420,7 +1505,7 @@ sub sse_deccompact()
&set_label("loop",16);
&sse_deccompact();
&add ($key,16);
- &cmp ($key,&DWP(24,"esp"));
+ &cmp ($key,$__end);
&ja (&label("out"));
# ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
@@ -1493,6 +1578,7 @@ sub sse_deccompact()
&ret ();
&function_end_B("_sse_AES_decrypt_compact");
+ }
######################################################################
# Vanilla block function.
@@ -1507,7 +1593,7 @@ sub decstep()
# optimal... or rather that all attempts to reorder didn't
# result in better performance [which by the way is not a
# bit lower than ecryption].
- if($i==3) { &mov ($key,&DWP(20,"esp")); }
+ if($i==3) { &mov ($key,$__key); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
&mov ($out,&DWP(0,$td,$out,8));
@@ -1522,12 +1608,12 @@ sub decstep()
&and ($tmp,0xFF);
&xor ($out,&DWP(2,$td,$tmp,8));
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
&xor ($out,&DWP(1,$td,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
+ if ($i==3) { &mov ($s[3],$__s0); }
&comment();
}
@@ -1546,7 +1632,7 @@ sub declast()
&mov ($tmp,&DWP(192-128,$td));
&mov ($acc,&DWP(224-128,$td));
&lea ($td,&DWP(-128,$td)); }
- if($i==3) { &mov ($key,&DWP(20,"esp")); }
+ if($i==3) { &mov ($key,$__key); }
else { &mov ($out,$s[0]); }
&and ($out,0xFF);
&movz ($out,&BP(0,$td,$out,1));
@@ -1565,21 +1651,21 @@ sub declast()
&shl ($tmp,16);
&xor ($out,$tmp);
- if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
else { &mov ($tmp,$s[3]); }
&shr ($tmp,24);
&movz ($tmp,&BP(0,$td,$tmp,1));
&shl ($tmp,24);
&xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
- if ($i==3) { &mov ($s[3],&DWP(4,"esp"));
+ if ($i==3) { &mov ($s[3],$__s0);
&lea ($td,&DWP(-2048,$td)); }
}
&public_label("AES_Td");
&function_begin_B("_x86_AES_decrypt");
# note that caller is expected to allocate stack frame for me!
- &mov (&DWP(20,"esp"),$key); # save key
+ &mov ($__key,$key); # save key
&xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key));
@@ -1591,7 +1677,7 @@ sub declast()
if ($small_footprint) {
&lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8));
- &mov (&DWP(24,"esp"),$acc); # end of key schedule
+ &mov ($__end,$acc); # end of key schedule
&set_label("loop",16);
&decstep(0,$tbl,$s0,$s3,$s2,$s1);
&decstep(1,$tbl,$s1,$s0,$s3,$s2);
@@ -1602,8 +1688,8 @@ sub declast()
&xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key));
- &cmp ($key,&DWP(24,"esp"));
- &mov (&DWP(20,"esp"),$key);
+ &cmp ($key,$__end);
+ &mov ($__key,$key);
&jb (&label("loop"));
}
else {
@@ -1624,7 +1710,7 @@ sub declast()
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("12rounds",4);
for ($i=1;$i<3;$i++) {
&decstep(0,$tbl,$s0,$s3,$s2,$s1);
@@ -1637,7 +1723,7 @@ sub declast()
&xor ($s3,&DWP(16*$i+12,$key));
}
&add ($key,32);
- &mov (&DWP(20,"esp"),$key); # advance rd_key
+ &mov ($__key,$key); # advance rd_key
&set_label("10rounds",4);
for ($i=1;$i<10;$i++) {
&decstep(0,$tbl,$s0,$s3,$s2,$s1);
@@ -1881,40 +1967,42 @@ sub declast()
&and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
&sub ("esp",$s1);
&add ("esp",4); # 4 is reserved for caller's return address
- &mov (&DWP(28,"esp"),$s0); # save stack pointer
+ &mov ($_esp,$s0); # save stack pointer
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tbl);
- &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point"));
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
&lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
+
# pick Td4 copy which can't "overlap" with stack frame or key schedule
&lea ($s1,&DWP(768-4,"esp"));
&sub ($s1,$tbl);
&and ($s1,0x300);
&lea ($tbl,&DWP(2048+128,$tbl,$s1));
- &bt (&DWP(0,$s0),25); # check for SSE bit
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),25); # check for SSE bit
&jnc (&label("x86"));
&movq ("mm0",&QWP(0,$acc));
&movq ("mm4",&QWP(8,$acc));
&call ("_sse_AES_decrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&movq (&QWP(0,$acc),"mm0"); # write output data
&movq (&QWP(8,$acc),"mm4");
&emms ();
&function_end_A();
-
+ }
&set_label("x86",16);
- &mov (&DWP(24,"esp"),$tbl);
+ &mov ($_tbl,$tbl);
&mov ($s0,&DWP(0,$acc)); # load input data
&mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc));
&mov ($s3,&DWP(12,$acc));
&call ("_x86_AES_decrypt_compact");
- &mov ("esp",&DWP(28,"esp")); # restore stack pointer
+ &mov ("esp",$_esp); # restore stack pointer
&mov ($acc,&wparam(1)); # load out
&mov (&DWP(0,$acc),$s0); # write output data
&mov (&DWP(4,$acc),$s1);
@@ -1927,130 +2015,138 @@ sub declast()
# unsigned char *ivp,const int enc);
{
# stack frame layout
-# -4(%esp) 0(%esp) return address
-# 0(%esp) 4(%esp) s0 backup
-# 4(%esp) 8(%esp) s1 backup
-# 8(%esp) 12(%esp) s2 backup
-# 12(%esp) 16(%esp) s3 backup
-# 16(%esp) 20(%esp) key backup
-# 20(%esp) 24(%esp) end of key schedule
-# 24(%esp) 28(%esp) ebp backup
-my $_esp=&DWP(28,"esp"); #saved %esp
-my $_inp=&DWP(32,"esp"); #copy of wparam(0)
-my $_out=&DWP(36,"esp"); #copy of wparam(1)
-my $_len=&DWP(40,"esp"); #copy of wparam(2)
-my $_key=&DWP(44,"esp"); #copy of wparam(3)
-my $_ivp=&DWP(48,"esp"); #copy of wparam(4)
-my $_tmp=&DWP(52,"esp"); #volatile variable
-my $ivec=&DWP(56,"esp"); #ivec[16]
-my $aes_key=&DWP(72,"esp"); #copy of aes_key
-my $mark=&DWP(72+240,"esp"); #copy of aes_key->rounds
+# -4(%esp) # return address 0(%esp)
+# 0(%esp) # s0 backing store 4(%esp)
+# 4(%esp) # s1 backing store 8(%esp)
+# 8(%esp) # s2 backing store 12(%esp)
+# 12(%esp) # s3 backing store 16(%esp)
+# 16(%esp) # key backup 20(%esp)
+# 20(%esp) # end of key schedule 24(%esp)
+# 24(%esp) # %ebp backup 28(%esp)
+# 28(%esp) # %esp backup
+my $_inp=&DWP(32,"esp"); # copy of wparam(0)
+my $_out=&DWP(36,"esp"); # copy of wparam(1)
+my $_len=&DWP(40,"esp"); # copy of wparam(2)
+my $_key=&DWP(44,"esp"); # copy of wparam(3)
+my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
+my $_tmp=&DWP(52,"esp"); # volatile variable
+#
+my $ivec=&DWP(60,"esp"); # ivec[16]
+my $aes_key=&DWP(76,"esp"); # copy of aes_key
+my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&public_label("AES_Te");
&public_label("AES_Td");
&function_begin("AES_cbc_encrypt");
&mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
&cmp ($s2,0);
- &je (&label("enc_out"));
+ &je (&label("drop_out"));
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($tbl);
-
- &pushf ();
- &cld ();
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
&cmp (&wparam(5),0);
- &je (&label("DECRYPT"));
-
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+ &jne (&label("picked_te"));
+ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
+ &set_label("picked_te");
- # allocate aligned stack frame...
- &lea ($key,&DWP(-76-244,"esp"));
- &and ($key,-64);
+ # one can argue if this is required
+ &pushf ();
+ &cld ();
- # ... and make sure it doesn't alias with AES_Te modulo 4096
+ &cmp ($s2,$speed_limit);
+ &jb (&label("slow_way"));
+ &test ($s2,15);
+ &jnz (&label("slow_way"));
+ if (!$x86only) {
+ &bt (&DWP(0,$s0),28); # check for hyper-threading bit
+ &jc (&label("slow_way"));
+ }
+ # pre-allocate aligned stack frame...
+ &lea ($acc,&DWP(-80-244,"esp"));
+ &and ($acc,-64);
+
+ # ... and make sure it doesn't alias with $tbl modulo 4096
&mov ($s0,$tbl);
- &lea ($s1,&DWP(2048,$tbl));
- &mov ($s3,$key);
+ &lea ($s1,&DWP(2048+256,$tbl));
+ &mov ($s3,$acc);
&and ($s0,0xfff); # s = %ebp&0xfff
- &and ($s1,0xfff); # e = (%ebp+2048)&0xfff
+ &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
&and ($s3,0xfff); # p = %esp&0xfff
&cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
- &jb (&label("te_break_out"));
+ &jb (&label("tbl_break_out"));
&sub ($s3,$s1);
- &sub ($key,$s3);
- &jmp (&label("te_ok"));
- &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz;
+ &sub ($acc,$s3);
+ &jmp (&label("tbl_ok"));
+ &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
&sub ($s3,$s0);
&and ($s3,0xfff);
- &add ($s3,72+256);
- &sub ($key,$s3);
- &align (4);
- &set_label("te_ok");
-
- &mov ($s0,&wparam(0)); # load inp
- &mov ($s1,&wparam(1)); # load out
- &mov ($s3,&wparam(3)); # load key
- &mov ($acc,&wparam(4)); # load ivp
+ &add ($s3,384);
+ &sub ($acc,$s3);
+ &set_label("tbl_ok",4);
- &exch ("esp",$key);
+ &lea ($s3,&wparam(0)); # obtain pointer to parameter block
+ &exch ("esp",$acc); # allocate stack frame
&add ("esp",4); # reserve for return address!
- &mov ($_esp,$key); # save %esp
+ &mov ($_tbl,$tbl); # save %ebp
+ &mov ($_esp,$acc); # save %esp
+
+ &mov ($s0,&DWP(0,$s3)); # load inp
+ &mov ($s1,&DWP(4,$s3)); # load out
+ #&mov ($s2,&DWP(8,$s3)); # load len
+ &mov ($key,&DWP(12,$s3)); # load key
+ &mov ($acc,&DWP(16,$s3)); # load ivp
+ &mov ($s3,&DWP(20,$s3)); # load enc flag
&mov ($_inp,$s0); # save copy of inp
&mov ($_out,$s1); # save copy of out
&mov ($_len,$s2); # save copy of len
- &mov ($_key,$s3); # save copy of key
+ &mov ($_key,$key); # save copy of key
&mov ($_ivp,$acc); # save copy of ivp
&mov ($mark,0); # copy of aes_key->rounds = 0;
- if ($compromise) {
- &cmp ($s2,$compromise);
- &jb (&label("skip_ecopy"));
- }
# do we copy key schedule to stack?
- &mov ($s1 eq "ebx" ? $s1 : "",$s3);
+ &mov ($s1 eq "ebx" ? $s1 : "",$key);
&mov ($s2 eq "ecx" ? $s2 : "",244/4);
&sub ($s1,$tbl);
- &mov ("esi",$s3);
+ &mov ("esi",$key);
&and ($s1,0xfff);
&lea ("edi",$aes_key);
- &cmp ($s1,2048);
- &jb (&label("do_ecopy"));
+ &cmp ($s1,2048+256);
+ &jb (&label("do_copy"));
&cmp ($s1,4096-244);
- &jb (&label("skip_ecopy"));
- &align (4);
- &set_label("do_ecopy");
+ &jb (&label("skip_copy"));
+ &set_label("do_copy",4);
&mov ($_key,"edi");
&data_word(0xA5F3F689); # rep movsd
- &set_label("skip_ecopy");
+ &set_label("skip_copy");
- &mov ($acc,$s0);
&mov ($key,16);
- &align (4);
- &set_label("prefetch_te");
+ &set_label("prefetch_tbl",4);
&mov ($s0,&DWP(0,$tbl));
&mov ($s1,&DWP(32,$tbl));
&mov ($s2,&DWP(64,$tbl));
- &mov ($s3,&DWP(96,$tbl));
+ &mov ($acc,&DWP(96,$tbl));
&lea ($tbl,&DWP(128,$tbl));
- &dec ($key);
- &jnz (&label("prefetch_te"));
+ &sub ($key,1);
+ &jnz (&label("prefetch_tbl"));
&sub ($tbl,2048);
- &mov (&DWP(24,"esp"),$tbl);
- &mov ($s2,$_len);
+ &mov ($acc,$_inp);
&mov ($key,$_ivp);
- &test ($s2,0xFFFFFFF0);
- &jz (&label("enc_tail")); # short input...
+ &cmp ($s3,0);
+ &je (&label("fast_decrypt"));
+
+#----------------------------- ENCRYPT -----------------------------#
&mov ($s0,&DWP(0,$key)); # load iv
&mov ($s1,&DWP(4,$key));
- &align (4);
- &set_label("enc_loop");
+ &set_label("fast_enc_loop",16);
&mov ($s2,&DWP(8,$key));
&mov ($s3,&DWP(12,$key));
@@ -2070,22 +2166,16 @@ my $mark=&DWP(72+240,"esp"); #copy of aes_key->rounds
&mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3);
+ &lea ($acc,&DWP(16,$acc)); # advance inp
&mov ($s2,$_len); # load len
-
- &lea ($acc,&DWP(16,$acc));
&mov ($_inp,$acc); # save inp
-
- &lea ($s3,&DWP(16,$key));
+ &lea ($s3,&DWP(16,$key)); # advance out
&mov ($_out,$s3); # save out
-
- &sub ($s2,16);
- &test ($s2,0xFFFFFFF0);
+ &sub ($s2,16); # decrease len
&mov ($_len,$s2); # save len
- &jnz (&label("enc_loop"));
- &test ($s2,15);
- &jnz (&label("enc_tail"));
+ &jnz (&label("fast_enc_loop"));
&mov ($acc,$_ivp); # load ivp
- &mov ($s2,&DWP(8,$key)); # restore last dwords
+ &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
&mov ($s3,&DWP(12,$key));
&mov (&DWP(0,$acc),$s0); # save ivec
&mov (&DWP(4,$acc),$s1);
@@ -2094,7 +2184,6 @@ my $mark=&DWP(72+240,"esp"); #copy of aes_key->rounds
&cmp ($mark,0); # was the key schedule copied?
&mov ("edi",$_key);
- &mov ("esp",$_esp);
&je (&label("skip_ezero"));
# zero copy of key schedule
&mov ("ecx",240/4);
@@ -2102,126 +2191,22 @@ my $mark=&DWP(72+240,"esp"); #copy of aes_key->rounds
&align (4);
&data_word(0xABF3F689); # rep stosd
&set_label("skip_ezero")
+ &mov ("esp",$_esp);
&popf ();
- &set_label("enc_out");
+ &set_label("drop_out");
&function_end_A();
&pushf (); # kludge, never executed
- &align (4);
- &set_label("enc_tail");
- &push ($key eq "edi" ? $key : ""); # push ivp
- &mov ($key,$_out); # load out
- &mov ($s1,16);
- &sub ($s1,$s2);
- &cmp ($key,$acc); # compare with inp
- &je (&label("enc_in_place"));
- &align (4);
- &data_word(0xA4F3F689); # rep movsb # copy input
- &jmp (&label("enc_skip_in_place"));
- &set_label("enc_in_place");
- &lea ($key,&DWP(0,$key,$s2));
- &set_label("enc_skip_in_place");
- &mov ($s2,$s1);
- &xor ($s0,$s0);
- &align (4);
- &data_word(0xAAF