summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2017-02-03 12:07:16 +0100
committerAndy Polyakov <appro@openssl.org>2017-02-06 08:21:42 +0100
commit384e6de4c7e35e37fb3d6fbeb32ddcb5eb0d3d3f (patch)
tree3eba1883b51094452284e267b6772b261db622b4
parente1dbf7f431b996010844e220d3200cbf2122dbb3 (diff)
x86_64 assembly pack: Win64 SEH face-lift.
- harmonize handlers with guidelines and themselves; - fix some bugs in handlers; - add missing handlers in chacha and ecp_nistz256 modules; Reviewed-by: Rich Salz <rsalz@openssl.org>
-rwxr-xr-xcrypto/aes/asm/aes-x86_64.pl42
-rw-r--r--crypto/aes/asm/aesni-sha256-x86_64.pl55
-rw-r--r--crypto/aes/asm/aesni-x86_64.pl274
-rw-r--r--crypto/aes/asm/bsaes-x86_64.pl161
-rwxr-xr-xcrypto/bn/asm/rsaz-avx2.pl2
-rw-r--r--crypto/bn/asm/x86_64-gf2m.pl17
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl12
-rwxr-xr-xcrypto/bn/asm/x86_64-mont5.pl4
-rwxr-xr-xcrypto/chacha/asm/chacha-x86_64.pl515
-rwxr-xr-xcrypto/ec/asm/ecp_nistz256-x86_64.pl530
-rw-r--r--crypto/modes/asm/ghash-x86_64.pl35
-rwxr-xr-xcrypto/sha/asm/sha1-x86_64.pl127
-rwxr-xr-xcrypto/sha/asm/sha512-x86_64.pl91
-rw-r--r--crypto/whrlpool/asm/wp-x86_64.pl19
14 files changed, 1268 insertions, 616 deletions
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index ae7fde20fe..5eecfdf8cf 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -599,6 +599,7 @@ $code.=<<___;
.hidden asm_AES_encrypt
asm_AES_encrypt:
AES_encrypt:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -607,7 +608,6 @@ AES_encrypt:
push %r15
# allocate frame "above" key schedule
- mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
@@ -617,7 +617,7 @@ AES_encrypt:
sub \$32,%rsp
mov %rsi,16(%rsp) # save out
- mov %r10,24(%rsp) # save real stack pointer
+ mov %rax,24(%rsp) # save original stack pointer
.Lenc_prologue:
mov %rdx,$key
@@ -649,13 +649,13 @@ AES_encrypt:
mov $s2,8($out)
mov $s3,12($out)
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lenc_epilogue:
ret
.size AES_encrypt,.-AES_encrypt
@@ -1197,6 +1197,7 @@ $code.=<<___;
.hidden asm_AES_decrypt
asm_AES_decrypt:
AES_decrypt:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -1205,7 +1206,6 @@ AES_decrypt:
push %r15
# allocate frame "above" key schedule
- mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
@@ -1215,7 +1215,7 @@ AES_decrypt:
sub \$32,%rsp
mov %rsi,16(%rsp) # save out
- mov %r10,24(%rsp) # save real stack pointer
+ mov %rax,24(%rsp) # save original stack pointer
.Ldec_prologue:
mov %rdx,$key
@@ -1249,13 +1249,13 @@ AES_decrypt:
mov $s2,8($out)
mov $s3,12($out)
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Ldec_epilogue:
ret
.size AES_decrypt,.-AES_decrypt
@@ -1675,10 +1675,9 @@ AES_cbc_encrypt:
mov %r9d,%r9d # clear upper half of enc
lea .LAES_Te(%rip),$sbox
+ lea .LAES_Td(%rip),%r10
cmp \$0,%r9
- jne .Lcbc_picked_te
- lea .LAES_Td(%rip),$sbox
-.Lcbc_picked_te:
+ cmoveq %r10,$sbox
mov OPENSSL_ia32cap_P(%rip),%r10d
cmp \$$speed_limit,%rdx
@@ -2580,7 +2579,6 @@ block_se_handler:
jae .Lin_block_prologue
mov 24(%rax),%rax # pull saved real stack pointer
- lea 48(%rax),%rax # adjust...
mov -8(%rax),%rbx
mov -16(%rax),%rbp
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index ba4964a850..2d6424fecd 100644
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -341,13 +341,13 @@ $code.=<<___;
${func}_xop:
.Lxop_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
@@ -363,7 +363,7 @@ ${func}_xop:
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
@@ -617,13 +617,13 @@ $code.=<<___ if ($win64);
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_xop:
ret
.size ${func}_xop,.-${func}_xop
@@ -639,13 +639,13 @@ $code.=<<___;
${func}_avx:
.Lavx_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
@@ -661,7 +661,7 @@ ${func}_avx:
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
@@ -868,13 +868,13 @@ $code.=<<___ if ($win64);
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx:
ret
.size ${func}_avx,.-${func}_avx
@@ -935,13 +935,13 @@ $code.=<<___;
${func}_avx2:
.Lavx2_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
and \$-256*$SZ,%rsp # align stack frame
add \$`2*$SZ*($rounds-8)`,%rsp
@@ -958,7 +958,7 @@ ${func}_avx2:
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
@@ -1205,13 +1205,13 @@ $code.=<<___ if ($win64);
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
@@ -1569,7 +1569,6 @@ ___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 443f2f7542..8ae6dbfa4f 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -1172,7 +1172,7 @@ ___
# with zero-round key xor.
{
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
-my ($key0,$ctr)=("${key_}d","${ivp}d");
+my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
@@ -1201,26 +1201,25 @@ $code.=<<___;
.align 16
.Lctr32_bulk:
- lea (%rsp),%rax
+ lea (%rsp),$key_ # use $key_ as frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8($key_) # offload everything
+ movaps %xmm7,-0x98($key_)
+ movaps %xmm8,-0x88($key_)
+ movaps %xmm9,-0x78($key_)
+ movaps %xmm10,-0x68($key_)
+ movaps %xmm11,-0x58($key_)
+ movaps %xmm12,-0x48($key_)
+ movaps %xmm13,-0x38($key_)
+ movaps %xmm14,-0x28($key_)
+ movaps %xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
# 8 16-byte words on top of stack are counter values
# xor-ed with zero-round key
@@ -1692,26 +1691,26 @@ $code.=<<___ if (!$win64);
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8($key_),%xmm6
+ movaps %xmm0,-0xa8($key_) # clear stack
+ movaps -0x98($key_),%xmm7
+ movaps %xmm0,-0x98($key_)
+ movaps -0x88($key_),%xmm8
+ movaps %xmm0,-0x88($key_)
+ movaps -0x78($key_),%xmm9
+ movaps %xmm0,-0x78($key_)
+ movaps -0x68($key_),%xmm10
+ movaps %xmm0,-0x68($key_)
+ movaps -0x58($key_),%xmm11
+ movaps %xmm0,-0x58($key_)
+ movaps -0x48($key_),%xmm12
+ movaps %xmm0,-0x48($key_)
+ movaps -0x38($key_),%xmm13
+ movaps %xmm0,-0x38($key_)
+ movaps -0x28($key_),%xmm14
+ movaps %xmm0,-0x28($key_)
+ movaps -0x18($key_),%xmm15
+ movaps %xmm0,-0x18($key_)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
@@ -1722,8 +1721,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x70(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8($key_),%rbp
+ lea ($key_),%rsp
.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
@@ -1740,32 +1739,32 @@ my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp"; # override so that we can use %r11 as FP
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_enc_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
movups ($ivp),$inout0 # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -2183,26 +2182,26 @@ $code.=<<___ if (!$win64);
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
@@ -2212,8 +2211,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+ lea (%r11),%rsp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -2224,26 +2223,25 @@ $code.=<<___;
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
movups ($ivp),$inout0 # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -2687,26 +2685,26 @@ $code.=<<___ if (!$win64);
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
@@ -2716,8 +2714,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+ lea (%r11),%rsp
.Lxts_dec_epilogue:
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2943,6 +2941,7 @@ $code.=<<___ if (!$win64);
pxor %xmm13,%xmm13
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
___
$code.=<<___ if ($win64);
movaps 0x00(%rsp),%xmm6
@@ -2967,14 +2966,14 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x90(%rsp)
lea 0xa0+0x28(%rsp),%rax
.Locb_enc_pop:
- lea 0xa0(%rsp),%rsp
___
$code.=<<___;
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
.Locb_enc_epilogue:
ret
.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
@@ -3410,6 +3409,7 @@ $code.=<<___ if (!$win64);
pxor %xmm13,%xmm13
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
___
$code.=<<___ if ($win64);
movaps 0x00(%rsp),%xmm6
@@ -3434,14 +3434,14 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x90(%rsp)
lea 0xa0+0x28(%rsp),%rax
.Locb_dec_pop:
- lea 0xa0(%rsp),%rsp
___
$code.=<<___;
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
.Locb_dec_epilogue:
ret
.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
@@ -3650,7 +3650,6 @@ ___
{
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
-my $inp_=$key_;
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
@@ -3732,7 +3731,7 @@ $code.=<<___;
jmp .Lcbc_ret
.align 16
.Lcbc_decrypt_bulk:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
@@ -3750,8 +3749,11 @@ $code.=<<___ if ($win64);
movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
+
+my $inp_=$key_="%rbp"; # reassign $key_
+
$code.=<<___;
- lea -8(%rax),%rbp
+ mov $key,$key_ # [re-]backup $key [after reassignment]
movups ($ivp),$iv
mov $rnds_,$rounds
cmp \$0x50,$len
@@ -3791,7 +3793,7 @@ $code.=<<___;
pxor $rndkey0,$inout1
$movkey 0x10-0x70($key),$rndkey1
pxor $rndkey0,$inout2
- xor $inp_,$inp_
+ mov \$-1,$inp_
cmp \$0x70,$len # is there at least 0x60 bytes ahead?
pxor $rndkey0,$inout3
pxor $rndkey0,$inout4
@@ -3807,8 +3809,8 @@ $code.=<<___;
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
aesdec $rndkey1,$inout6
- setnc ${inp_}b
- shl \$7,$inp_
+ adc \$0,$inp_
+ and \$128,$inp_
aesdec $rndkey1,$inout7
add $inp,$inp_
$movkey 0x30-0x70($key),$rndkey1
@@ -4172,8 +4174,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0xa0(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+ lea (%r11),%rsp
.Lcbc_ret:
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
@@ -4744,13 +4746,16 @@ ctr_xts_se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 160($context),%rax # pull context->Rbp
- lea -0xa0(%rax),%rsi # %xmm save area
+ mov 208($context),%rax # pull context->R11
+
+ lea -0xa8(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- jmp .Lcommon_rbp_tail
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
+ jmp .Lcommon_seh_tail
.size ctr_xts_se_handler,.-ctr_xts_se_handler
.type ocb_se_handler,\@abi-omnipotent
@@ -4834,9 +4839,13 @@ cbc_se_handler:
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
+ mov 120($context),%rax # pull context->Rax
+
lea .Lcbc_decrypt_body(%rip),%r10
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
- jb .Lrestore_cbc_rax
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
lea .Lcbc_ret(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
@@ -4847,15 +4856,10 @@ cbc_se_handler:
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
-.Lcommon_rbp_tail:
- mov 160($context),%rax # pull context->Rbp
- mov (%rax),%rbp # restore saved %rbp
- lea 8(%rax),%rax # adjust stack pointer
- mov %rbp,160($context) # restore context->Rbp
- jmp .Lcommon_seh_tail
+ mov 208($context),%rax # pull context->R11
-.Lrestore_cbc_rax:
- mov 120($context),%rax
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
.Lcommon_seh_tail:
mov 8(%rax),%rdi
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
index 0d49947d81..cce5795302 100644
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -1334,7 +1334,7 @@ $code.=<<___;
cmp %rax, %rbp
jb .Lecb_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1347,17 +1347,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lecb_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lecb_enc_epilogue:
ret
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
@@ -1536,7 +1536,7 @@ $code.=<<___;
cmp %rax, %rbp
jb .Lecb_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1549,17 +1549,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lecb_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lecb_dec_epilogue:
ret
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
@@ -1826,7 +1826,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lcbc_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1839,17 +1839,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lcbc_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lcbc_dec_epilogue:
ret
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
@@ -2058,7 +2058,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lctr_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2071,17 +2071,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lctr_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lctr_enc_epilogue:
ret
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
@@ -2448,7 +2448,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lxts_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2461,17 +2461,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lxts_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lxts_enc_epilogue:
ret
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
@@ -2855,7 +2855,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lxts_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2868,17 +2868,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lxts_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lxts_dec_epilogue:
ret
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
@@ -2974,31 +2974,34 @@ se_handler:
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lin_prologue
-
- mov 152($context),%rax # pull context->Rsp
+ cmp %r10,%rbx # context->Rip<=prologue label
+ jbe .Lin_prologue
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=tail label
+ jae .Lin_tail
+
mov 160($context),%rax # pull context->Rbp
lea 0x40(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0xa0(%rax),%rax # adjust stack pointer
-
- mov 0x70(%rax),%rbp
- mov 0x68(%rax),%rbx
- mov 0x60(%rax),%r12
- mov 0x58(%rax),%r13
- mov 0x50(%rax),%r14
- mov 0x48(%rax),%r15
- lea 0x78(%rax),%rax # adjust stack pointer
+ lea 0xa0+0x78(%rax),%rax # adjust stack pointer
+
+.Lin_tail:
+ mov -48(%rax),%rbp
+ mov -40(%rax),%rbx
+ mov -32(%rax),%r12
+ mov -24(%rax),%r13
+ mov -16(%rax),%r14
+ mov -8(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
@@ -3079,28 +3082,40 @@ $code.=<<___ if ($ecb);
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+ .rva .Lecb_enc_tail
+ .long 0
.Lecb_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+ .rva .Lecb_dec_tail
+ .long 0
___
$code.=<<___;
.Lcbc_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+ .rva .Lcbc_dec_tail
+ .long 0
.Lctr_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+ .rva .Lctr_enc_tail
+ .long 0
.Lxts_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+ .rva .Lxts_enc_tail
+ .long 0
.Lxts_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+ .rva .Lxts_dec_tail
+ .long 0
___
}
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
index f34c84f452..e620285e61 100755
--- a/crypto/bn/asm/rsaz-avx2.pl
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -1738,11 +1738,11 @@ $code.=<<___ if ($win64);
movaps -0x38(%r11),%xmm13
movaps -0x28(%r11),%xmm14
movaps -0x18(%r11),%xmm15
-.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
lea (%r11),%rsp
ret
+.LSEH_end_rsaz_1024_gather5:
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
index d962f62033..d237c1e3d2 100644
--- a/crypto/bn/asm/x86_64-gf2m.pl
+++ b/crypto/bn/asm/x86_64-gf2m.pl
@@ -174,8 +174,9 @@ $code.=<<___;
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
- mov OPENSSL_ia32cap_P(%rip),%rax
- bt \$33,%rax
+ mov %rsp,%rax
+ mov OPENSS