summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--crypto/aes/asm/aesni-x86.pl211
-rw-r--r--crypto/aes/asm/aesni-x86_64.pl306
-rw-r--r--crypto/engine/eng_aesni.c178
3 files changed, 633 insertions, 62 deletions
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 72faa78d1f..8c1426cd5b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -23,7 +23,8 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
-$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
+if ($PREFIX eq "aesni") { $movekey=*movaps; }
+else { $movekey=*movups; }
$len="eax";
$rounds="ecx";
@@ -41,7 +42,7 @@ $rndkey1="xmm4";
$ivec="xmm5";
$in0="xmm6";
$in1="xmm7"; $inout3="xmm7";
-
+
# Inline version of internal aesni_[en|de]crypt1
sub aesni_inline_generate1
{ my $p=shift;
@@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop
&ret();
&function_end_B("_aesni_${p}rypt1");
}
-
+
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
@@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop
&movups (&QWP(0,"eax"),$inout0);
&ret ();
&function_end_B("${PREFIX}_decrypt");
-
+
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
@@ -229,8 +230,9 @@ sub aesni_generate4
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
-
+
if ($PREFIX eq "aesni") {
+######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
# size_t length, const AES_KEY *key,
# int enc);
@@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") {
&mov ($rounds_,$rounds); # backup $rounds
&jz (&label("ecb_decrypt"));
- &sub ($len,0x40);
+ &cmp ($len,0x40);
&jbe (&label("ecb_enc_tail"));
+ &sub ($len,0x40);
&jmp (&label("ecb_enc_loop3"));
&set_label("ecb_enc_loop3",16);
@@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-0x10,$out),$inout2);
&ja (&label("ecb_enc_loop3"));
-&set_label("ecb_enc_tail");
&add ($len,0x40);
&jz (&label("ecb_ret"));
- &cmp ($len,0x10);
- &movups ($inout0,&QWP(0,$inp));
- &je (&label("ecb_enc_one"));
+&set_label("ecb_enc_tail");
&cmp ($len,0x20);
+ &movups ($inout0,&QWP(0,$inp));
+ &jb (&label("ecb_enc_one"));
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_enc_two"));
&cmp ($len,0x30);
@@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
-
+######################################################################
&set_label("ecb_decrypt",16);
- &sub ($len,0x40);
+ &cmp ($len,0x40);
&jbe (&label("ecb_dec_tail"));
+ &sub ($len,0x40);
&jmp (&label("ecb_dec_loop3"));
&set_label("ecb_dec_loop3",16);
@@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-0x10,$out),$inout2);
&ja (&label("ecb_dec_loop3"));
-&set_label("ecb_dec_tail");
&add ($len,0x40);
&jz (&label("ecb_ret"));
- &cmp ($len,0x10);
- &movups ($inout0,&QWP(0,$inp));
- &je (&label("ecb_dec_one"));
+&set_label("ecb_dec_tail");
&cmp ($len,0x20);
+ &movups ($inout0,&QWP(0,$inp));
+ &jb (&label("ecb_dec_one"));
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_dec_two"));
&cmp ($len,0x30);
@@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") {
&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
-}
+
+######################################################################
+# handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($key_,"esp");
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_);
+
+ &movups ($inout3,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,3);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &pextrd ($rounds_,$inout3,3); # pull 32-bit counter
+ &pinsrd ($inout3,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+ &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
+
+ # $ivec is vector of 3 32-bit counters
+ &pxor ($ivec,$ivec);
+ &bswap ($rounds_);
+ &pinsrd ($ivec,$rounds_,0);
+ &inc ($rounds_);
+ &pinsrd ($ivec,$rounds_,1);
+ &inc ($rounds_);
+ &pinsrd ($ivec,$rounds_,2);
+
+ &cmp ($len,4);
+ &pshufb ($ivec,$rndkey0); # byte swap
+ &jbe (&label("ctr32_tail"));
+ &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec
+ &mov ($rounds_,$rounds);
+ &mov ($key_,$key);
+ &sub ($len,4);
+ &jmp (&label("ctr32_loop3"));
+
+&set_label("ctr32_loop3",16);
+ &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$ivec,2<<6);
+ &pshufd ($inout2,$ivec,1<<6);
+ &por ($inout0,$inout3); # merge counter-less ivec
+ &por ($inout1,$inout3);
+ &por ($inout2,$inout3);
+
+ &call ("_aesni_encrypt3");
+
+ &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask
+ &movups ($in0,&QWP(0,$inp));
+ &movups ($in1,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &pshufb($ivec,$rndkey0); # byte swap
+ &paddd ($ivec,&QWP(16,"esp")); # counter increment
+ &pxor ($in0,$inout0);
+ &pxor ($in1,$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$in1);
+ &movups (&QWP(0x20,$out),$rndkey1);
+ &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec
+ &pshufb($ivec,$rndkey0); # byte swap
+
+ &sub ($len,3);
+ &lea ($inp,&DWP(0x30,$inp));
+ &lea ($out,&DWP(0x30,$out));
+ &mov ($key,$key_);
+ &mov ($rounds,$rounds_);
+ &ja (&label("ctr32_loop3"));
+
+ &add ($len,4);
+ &pextrd ($rounds_,$ivec,1); # might need last counter value
+ &jz (&label("ctr32_ret"));
+ &bswap ($rounds_);
+
+&set_label("ctr32_tail");
+ &cmp ($len,2);
+ &pshufd ($inout0,$ivec,3<<6);
+ &pshufd ($inout1,$ivec,2<<6);
+ &pshufd ($inout2,$ivec,1<<6);
+ &por ($inout0,$inout3);
+ &jb (&label("ctr32_one"));
+ &por ($inout1,$inout3);
+ &je (&label("ctr32_two"));
+ &cmp ($len,3);
+ &por ($inout2,$inout3);
+ &je (&label("ctr32_three"));
+
+ &inc ($rounds_); # compose last counter value
+ &bswap ($rounds_);
+ &pinsrd ($inout3,$rounds_,3);
+
+ &call ("_aesni_encrypt4");
+
+ &movups ($in0,&QWP(0,$inp));
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &movups ($ivec,&QWP(0x30,$inp));
+ &pxor ($in0,$inout0);
+ &pxor ($rndkey1,$inout1);
+ &pxor ($rndkey0,$inout2);
+ &pxor ($ivec,$inout3);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$rndkey1);
+ &movups (&QWP(0x20,$out),$rndkey0);
+ &movups (&QWP(0x30,$out),$ivec);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one",16);
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &pxor ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt3");
+ &movups ($in0,&QWP(0,$inp));
+ &movups ($in1,&QWP(0x10,$inp));
+ &pxor ($in0,$inout0);
+ &pxor ($in1,$inout1);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$in1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($in0,&QWP(0,$inp));
+ &movups ($in1,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &pxor ($in0,$inout0);
+ &pxor ($in1,$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$in1);
+ &movups (&QWP(0x20,$out),$rndkey1);
+
+&set_label("ctr32_ret");
+ &mov ("esp",&DWP(48,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+}
+
+######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
@@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") {
&mov ($inp,$out); # $inp and $out are the same
&mov ($key,$key_); # restore $key
&jmp (&label("cbc_enc_loop"));
-
+######################################################################
&set_label("cbc_decrypt",16);
- &sub ($len,0x40);
+ &cmp ($len,0x40);
&jbe (&label("cbc_dec_tail"));
+ &sub ($len,0x40);
&jmp (&label("cbc_dec_loop3"));
&set_label("cbc_dec_loop3",16);
@@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-0x10,$out),$inout2);
&ja (&label("cbc_dec_loop3"));
-&set_label("cbc_dec_tail");
&add ($len,0x40);
&jz (&label("cbc_ret"));
+&set_label("cbc_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x10);
&movaps ($in0,$inout0);
@@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") {
&mov ($key_,&wparam(4));
&movups (&QWP(0,$key_),$ivec); # output IV
&function_end("${PREFIX}_cbc_encrypt");
-
+
+######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index cdc076e24f..d8697519e4 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -41,7 +41,7 @@ $inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
-$ivp="%r8"; # cbc
+$ivp="%r8"; # cbc, ctr
$rnds_="%r10d"; # backup copy for $rounds
$key_="%r11"; # backup copy for $key
@@ -51,7 +51,7 @@ $inout0="%xmm0"; $inout1="%xmm1";
$inout2="%xmm2"; $inout3="%xmm3";
$rndkey0="%xmm4"; $rndkey1="%xmm5";
-$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
+$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR
$in1="%xmm8"; $in2="%xmm9";
# Inline version of internal aesni_[en|de]crypt1.
@@ -214,6 +214,7 @@ ___
&aesni_generate4("dec");
if ($PREFIX eq "aesni") {
+########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
# size_t length, const AES_KEY *key,
# int enc);
@@ -232,8 +233,9 @@ aesni_ecb_encrypt:
mov $rounds,$rnds_ # backup $rounds
jz .Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
- sub \$0x40,$len
+ cmp \$0x40,$len
jbe .Lecb_enc_tail
+ sub \$0x40,$len
jmp .Lecb_enc_loop3
.align 16
.Lecb_enc_loop3:
@@ -251,14 +253,13 @@ aesni_ecb_encrypt:
movups $inout2,-0x10($out)
ja .Lecb_enc_loop3
-.Lecb_enc_tail:
add \$0x40,$len
jz .Lecb_ret
- cmp \$0x10,$len
- movups ($inp),$inout0
- je .Lecb_enc_one
+.Lecb_enc_tail:
cmp \$0x20,$len
+ movups ($inp),$inout0
+ jb .Lecb_enc_one
movups 0x10($inp),$inout1
je .Lecb_enc_two
cmp \$0x30,$len
@@ -294,8 +295,9 @@ $code.=<<___;
#--------------------------- ECB DECRYPT ------------------------------#
.align 16
.Lecb_decrypt:
- sub \$0x40,$len
+ cmp \$0x40,$len
jbe .Lecb_dec_tail
+ sub \$0x40,$len
jmp .Lecb_dec_loop3
.align 16
.Lecb_dec_loop3:
@@ -313,14 +315,13 @@ $code.=<<___;
movups $inout2,-0x10($out)
ja .Lecb_dec_loop3
-.Lecb_dec_tail:
add \$0x40,$len
jz .Lecb_ret
- cmp \$0x10,$len
- movups ($inp),$inout0
- je .Lecb_dec_one
+.Lecb_dec_tail:
cmp \$0x20,$len
+ movups ($inp),$inout0
+ jb .Lecb_dec_one
movups 0x10($inp),$inout1
je .Lecb_dec_two
cmp \$0x30,$len
@@ -357,8 +358,175 @@ $code.=<<___;
ret
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
+######################################################################
+# handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+$increment="%xmm10";
+$bswap_mask="%xmm11";
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x68(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+
+.Lctr32_body:
+___
+$code.=<<___;
+ movups ($ivp),$inout3
+ movaps .Lincrement(%rip),$increment
+ movaps .Lbswap_mask(%rip),$bswap_mask
+ xor $rounds,$rounds
+ pextrd \$3,$inout3,$rnds_ # pull 32-bit counter
+ pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter
+
+ mov 240($key),$rounds # key->rounds
+ pxor $iv,$iv # vector of 3 32-bit counters
+ bswap $rnds_
+ pinsrd \$0,$rnds_,$iv
+ inc $rnds_
+ pinsrd \$1,$rnds_,$iv
+ inc $rnds_
+ pinsrd \$2,$rnds_,$iv
+
+ cmp \$4,$len
+ pshufb $bswap_mask,$iv
+ jbe .Lctr32_tail
+ mov $rounds,$rnds_
+ mov $key,$key_
+ sub \$4,$len
+ jmp .Lctr32_loop3
+
+.align 16
+.Lctr32_loop3:
+ pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword
+ pshufd \$`2<<6`,$iv,$inout1
+ pshufd \$`1<<6`,$iv,$inout2
+ movups ($inp),$in0
+ movups 0x10($inp),$in1
+ movups 0x20($inp),$in2
+ por $inout3,$inout0 # merge counter-less ivec
+ por $inout3,$inout1
+ por $inout3,$inout2
+ pshufb $bswap_mask,$iv
+
+ call _aesni_encrypt3
+
+ paddd $increment,$iv
+ pxor $inout0,$in0
+ pxor $inout1,$in1
+ pxor $inout2,$in2
+ pshufb $bswap_mask,$iv
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+
+ sub \$3,$len
+ lea 0x30($inp),$inp
+ lea 0x30($out),$out
+ mov $key_,$key
+ mov $rnds_,$rounds
+ ja .Lctr32_loop3
+
+ add \$4,$len
+ pextrd \$1,$iv,$rnds_ # might need last counter value
+ jz .Lctr32_done
+ bswap $rnds_
+
+.Lctr32_tail:
+ cmp \$2,$len
+ pshufd \$`3<<6`,$iv,$inout0
+ pshufd \$`2<<6`,$iv,$inout1
+ pshufd \$`1<<6`,$iv,$inout2
+ por $inout3,$inout0
+ movups ($inp),$in0
+ jb .Lctr32_one
+ por $inout3,$inout1
+ movups 0x10($inp),$in1
+ je .Lctr32_two
+ cmp \$3,$len
+ por $inout3,$inout2
+ movups 0x20($inp),$in2
+ je .Lctr32_three
+
+ inc $rnds_ # compose last counter value
+ bswap $rnds_
+ pinsrd \$3,$rnds_,$inout3
+ movups 0x30($inp),$iv
+
+ call _aesni_encrypt4
+
+ pxor $inout0,$in0
+ pxor $inout1,$in1
+ pxor $inout2,$in2
+ pxor $inout3,$iv
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+ movups $iv,0x30($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_one:
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ pxor $inout0,$in0
+ movups $in0,($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_two:
+ call _aesni_encrypt3
+ pxor $inout0,$in0
+ pxor $inout1,$in1
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ jmp .Lctr32_done
+
+.align 16
+.Lctr32_three:
+ call _aesni_encrypt3
+ pxor $inout0,$in0
+ pxor $inout1,$in1
+ pxor $inout2,$in2
+ movups $in0,($out)
+ movups $in1,0x10($out)
+ movups $in2,0x20($out)
+
+.Lctr32_done:
+___
+
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ lea 0x68(%rsp),%rsp
+___
+$code.=<<___;
+.Lctr32_ret:
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
}
+########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
@@ -429,9 +597,10 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
movups ($ivp),$iv
- sub \$0x40,$len
+ cmp \$0x40,$len
mov $rnds_,$rounds
jbe .Lcbc_dec_tail
+ sub \$0x40,$len
jmp .Lcbc_dec_loop3
.align 16
.Lcbc_dec_loop3:
@@ -456,11 +625,11 @@ $code.=<<___;
movups $inout2,-0x10($out)
ja .Lcbc_dec_loop3
-.Lcbc_dec_tail:
add \$0x40,$len
movups $iv,($ivp)
jz .Lcbc_dec_ret
+.Lcbc_dec_tail:
movups ($inp),$inout0
cmp \$0x10,$len
movaps $inout0,$in0
@@ -796,6 +965,11 @@ ___
}
$code.=<<___;
+.align 64
+.Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement:
+ .long 3,3,3,0
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
@@ -810,9 +984,11 @@ $disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
-.type cbc_se_handler,\@abi-omnipotent
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type ecb_se_handler,\@abi-omnipotent
.align 16
-cbc_se_handler:
+ecb_se_handler:
push %rsi
push %rdi
push %rbx
@@ -825,30 +1001,48 @@ cbc_se_handler:
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ jmp .Lcommon_seh_exit
+.size ecb_se_handler,.-ecb_se_handler
+
+.type ctr32_se_handler,\@abi-omnipotent
+.align 16
+ctr32_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lcbc_decrypt(%rip),%r10
+ lea .Lctr32_body(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
- jb .Lin_prologue
+ jb .Lin_ctr32_prologue
- lea .Lcbc_decrypt_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<cbc_decrypt_body
- jb .Lrestore_rax
+ mov 152($context),%rax # pull context->Rsp
- lea .Lcbc_ret(%rip),%r10
- cmp %r10,%rbx # context->Rip>="epilogue" label
- jae .Lin_prologue
+ lea .Lctr32_ret(%rip),%r10
+ cmp %r10,%rbx
+ jae .Lin_ctr32_prologue
lea 0(%rax),%rsi # top of stack
lea 512($context),%rdi # &context.Xmm6
- mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x58(%rax),%rax # adjust stack pointer
- jmp .Lin_prologue
+ lea 0x68(%rax),%rax # adjust stack pointer
-.Lrestore_rax:
- mov 120($context),%rax
-.Lin_prologue:
+.Lin_ctr32_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
@@ -856,11 +1050,12 @@ cbc_se_handler:
mov %rdi,176($context) # restore context->Rdi
jmp .Lcommon_seh_exit
-.size cbc_se_handler,.-cbc_se_handler
-
-.type ecb_se_handler,\@abi-omnipotent
+.size ctr32_se_handler,.-ctr32_se_handler
+___
+$code.=<<___;
+.type cbc_se_handler,\@abi-omnipotent
.align 16
-ecb_se_handler:
+cbc_se_handler:
push %rsi
push %rdi
push %rbx
@@ -873,8 +1068,33 @@ ecb_se_handler:
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lcbc_decrypt(%rip),%r10
+ cmp %r10,%rbx # context->Rip<"prologue" label
+ jb .Lin_cbc_prologue
+
+ lea .Lcbc_decrypt_body(%rip),%r10
+ cmp %r10,%rbx # context->Rip<cbc_decrypt_body
+ jb .Lrestore_cbc_rax
+
+ lea .Lcbc_ret(%rip),%r10
+ cmp %r10,%rbx # context->Rip>="epilogue" label
+ jae .Lin_cbc_prologue
+
+ lea 0(%rax),%rsi # top of stack
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+ jmp .Lin_cbc_prologue
+
+.Lrestore_cbc_rax:
+ mov 120($context),%rax
+.Lin_cbc_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
@@ -915,10 +1135,17 @@ ecb_se_handler:
.section .pdata
.align 4
- .rva .LSEH_begin_${PREFIX}_ecb_encrypt
- .rva .LSEH_end_${PREFIX}_ecb_encrypt
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ecb_encrypt
+ .rva .LSEH_end_aesni_ecb_encrypt
.rva .LSEH_info_ecb
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+___
+$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
.rva .LSEH_end_${PREFIX}_cbc_encrypt
.rva .LSEH_info_cbc
@@ -932,9 +1159,16 @@ ecb_se_handler:
.rva .LSEH_info_key
.section .xdata
.align 8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
.byte 9,0,0,0
.rva ecb_se_handler
+.LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr32_se_handler
+___
+$code.=<<___;
.LSEH_info_cbc:
.byte 9,0,0,0
.rva cbc_se_handler
diff --git a/crypto/engine/eng_aesni.c b/crypto/engine/eng_aesni.c
index 2a997cae36..70b2838b4e 100644
--- a/crypto/engine/eng_aesni.c
+++ b/crypto/engine/eng_aesni.c
@@ -111,6 +111,35 @@ void ENGINE_load_aesni (void)
}
#ifdef COMPILE_HW_AESNI
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#if defined(__GNUC__) && __GNUC__>=2
+# define BSWAP4(x) ({ u32 ret=(x); \
+ asm volatile ("bswapl %0" \
+ : "+r"(ret)); ret; })
+#elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+# pragma intrinsic(_byteswap_ulong)
+# define BSWAP4(x) _byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+ __inline u32 _bswap4(u32 val) {
+ _asm mov eax,val
+ _asm bswap eax
+ }
+# define BSWAP4(x) _bswap4(x)
+# endif
+#endif
+
+#ifdef BSWAP4
+#define GETU32(p) BSWAP4(*(const u32 *)(p))
+#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
+#else
+#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
+
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
AES_KEY *key);
int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
@@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in,
const AES_KEY *key,
unsigned char *ivec, int enc);
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+ unsigned char *out,
+ size_t blocks,
+ const AES_KEY *key,
+ const unsigned char *ivec);
+
/* Function for ENGINE detection and control */
static int aesni_init(ENGINE *e);
@@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = {
NID_aes_128_cbc,
NID_aes_128_cfb,
NID_aes_128_ofb,
+ NID_aes_128_ctr,
NID_aes_192_ecb,
NID_aes_192_cbc,
NID_aes_192_cfb,
NID_aes_192_ofb,
+ NID_aes_192_ctr,
NID_aes_256_ecb,
NID_aes_256_cbc,
NID_aes_256_cfb,
NID_aes_256_ofb,
+ NID_aes_256_ctr,
};
static int aesni_cipher_nids_num =
(sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0]));
@@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key,
int ret;
AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
- if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
- || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
- || enc)
- ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
- else
+ if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE
+ || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE)
+ && !enc)
ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key);
+ else
+ ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
if(ret < 0) {
EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
return 0;
}
+ if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV)
+ {
+ if (iv!=NULL)
+ memcpy (ctx->iv,iv,ctx->cipher->iv_len);
+ else {
+ EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED);
+ return 0;
+ }
+ }
+
return 1;
}
@@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC);
DECLARE_AES_EVP(256,cfb,CFB);
DECLARE_AES_EVP(256,ofb,OFB);
+static void ctr96_inc(unsigned char *counter) {
+ u32 n=12;
+ u8 c;
+
+ do {
+ --n;
+ c = counter[n];
+ ++c;
+ counter[n] = c;
+ if (c) return;
+ } while (n);
+}
+
+static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+{
+ AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
+ u32 n, ctr32;
+ n = ctx->num;
+
+ while (n && len) {
+ *(out++) = *(in++) ^ ctx->buf[n];
+ --len;
+ n = (n+1) % 16;
+ }
+
+ ctr32 = GETU32(ctx->iv+12);
+ while (len>=16) {
+ size_t blocks = len/16;
+ /*
+ * 1<<24 is just a not-so-small yet not-so-large number...
+ */
+ if (blocks > (1U<<24)) blocks = (1U<<24);
+ /*
+ * As aesni_ctr32 operates on 32-bit counter, caller
+ * has to handle overflow. 'if' below detects the
+ * overflow, which is then handled by limiting the
+ * amount of blocks to the exact overflow point...
+ */
+ ctr32 += (u32)blocks;
+ if (ctr32 < blocks) {
+ blocks -= ctr32;
+ ctr32 = 0;
+ }
+ aesni_ctr32_encrypt_blocks(in,out,blocks,key,ctx->iv);
+ /* aesni_ctr32 does not update ctx->iv, caller does: */
+ PUTU32(ctx->iv+12,ctr32);
+ /* ... overflow was detected, propagate carry. */
+ if (ctr32 == 0) ctr96_inc(ctx->iv);
+ blocks *= 16;
+ len -= blocks;
+ out += blocks;
+ in += blocks;
+ }
+ if (len) {
+ aesni_encrypt(ctx->iv,ctx->buf,key);
+ ++ctr32;
+ PUTU32(ctx->iv+12,ctr32);
+ if (ctr32 == 0) ctr96_inc(ctx->iv);
+ while (len--) {
+ out[n] = in[n] ^ ctx->buf[n];
+ ++n;
+ }
+ }
+ ctx->num = n;
+
+ return 1;
+}
+
+static const EVP_CIPHER aesni_128_ctr=
+ {
+ NID_aes_128_ctr,1,16,16,
+ EVP_CIPH_CUSTOM_IV,
+ aesni_init_key,
+ aesni_counter,
+ NULL,
+ sizeof(AESNI_KEY),
+ NULL,
+ NULL,
+ NULL,
+ NULL
+ };
+
+static const EVP_CIPHER aesni_192_ctr=
+ {
+ NID_aes_192_ctr,1,24,16,
+ EVP_CIPH_CUSTOM_IV,
+ aesni_init_key,
+ aesni_counter,
+ NULL,
+ sizeof(AESNI_KEY),
+ NULL,
+ NULL,
+ NULL,
+ NULL
+ };
+
+static const EVP_CIPHER aesni_256_ctr=
+ {
+ NID_aes_256_ctr,1,32,16,
+ EVP_CIPH_CUSTOM_IV,
+ aesni_init_key,
+ aesni_counter,
+ NULL,
+ sizeof(AESNI_KEY),
+ NULL,
+ NULL,
+ NULL,
+ NULL
+ };
+
static int
aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
const int **nids, int nid)
@@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
case NID_aes_128_ofb:
*cipher = &aesni_128_ofb;
break;
+ case NID_aes_128_ctr:
+ *cipher = &aesni_128_ctr;
+ break;
case NID_aes_192_ecb:
*cipher = &aesni_192_ecb;
@@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
case NID_aes_192_ofb:
*cipher = &aesni_192_ofb;
break;
+ case NID_aes_192_ctr:
+ *cipher = &aesni_192_ctr;
+ break;
case NID_aes_256_ecb:
*cipher = &aesni_256_ecb;
@@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
case NID_aes_256_ofb:
*cipher = &aesni_256_ofb;
break;
+ case NID_aes_256_ctr:
+ *cipher = &aesni_256_ctr;
+ break;
default:
/* Sorry, we don't support this NID */