summary | refs | log | tree | commit | diff | stats
path: root/crypto/aes/asm/aesni-x86.pl
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/aes/asm/aesni-x86.pl')
-rw-r--r-- crypto/aes/asm/aesni-x86.pl 211
1 files changed, 190 insertions, 21 deletions
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 72faa78d1f..8c1426cd5b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -23,7 +23,8 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
-$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
+if ($PREFIX eq "aesni") { $movekey=*movaps; }
+else { $movekey=*movups; }
$len="eax";
$rounds="ecx";
@@ -41,7 +42,7 @@ $rndkey1="xmm4";
$ivec="xmm5";
$in0="xmm6";
$in1="xmm7"; $inout3="xmm7";
-
+
# Inline version of internal aesni_[en|de]crypt1
sub aesni_inline_generate1
{ my $p=shift;
@@ -104,7 +105,7 @@ sub aesni_generate1 # fully unrolled loop
&ret();
&function_end_B("_aesni_${p}rypt1");
}
-
+
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
@@ -136,7 +137,7 @@ sub aesni_generate1 # fully unrolled loop
&movups (&QWP(0,"eax"),$inout0);
&ret ();
&function_end_B("${PREFIX}_decrypt");
-
+
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
@@ -229,8 +230,9 @@ sub aesni_generate4
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
-
+
if ($PREFIX eq "aesni") {
+######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
# size_t length, const AES_KEY *key,
# int enc);
@@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") {
&mov ($rounds_,$rounds); # backup $rounds
&jz (&label("ecb_decrypt"));
- &sub ($len,0x40);
+ &cmp ($len,0x40);
&jbe (&label("ecb_enc_tail"));
+ &sub ($len,0x40);
&jmp (&label("ecb_enc_loop3"));
&set_label("ecb_enc_loop3",16);
@@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-0x10,$out),$inout2);
&ja (&label("ecb_enc_loop3"));
-&set_label("ecb_enc_tail");
&add ($len,0x40);
&jz (&label("ecb_ret"));
- &cmp ($len,0x10);
- &movups ($inout0,&QWP(0,$inp));
- &je (&label("ecb_enc_one"));
+&set_label("ecb_enc_tail");
&cmp ($len,0x20);
+ &movups ($inout0,&QWP(0,$inp));
+ &jb (&label("ecb_enc_one"));
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_enc_two"));
&cmp ($len,0x30);
@@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
-
+######################################################################
&set_label("ecb_decrypt",16);
- &sub ($len,0x40);
+ &cmp ($len,0x40);
&jbe (&label("ecb_dec_tail"));
+ &sub ($len,0x40);
&jmp (&label("ecb_dec_loop3"));
&set_label("ecb_dec_loop3",16);
@@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-0x10,$out),$inout2);
&ja (&label("ecb_dec_loop3"));
-&set_label("ecb_dec_tail");
&add ($len,0x40);
&jz (&label("ecb_ret"));
- &cmp ($len,0x10);
- &movups ($inout0,&QWP(0,$inp));
- &je (&label("ecb_dec_one"));
+&set_label("ecb_dec_tail");
&cmp ($len,0x20);
+ &movups ($inout0,&QWP(0,$inp));
+ &jb (&label("ecb_dec_one"));
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_dec_two"));
&cmp ($len,0x30);
@@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") {
&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
-}
+
+######################################################################
+# Handles only complete blocks, operates on a 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+# Stack frame after alignment: 0 byte-swap mask, 16 counter increment, 32 counter-less ivec, 48 saved esp.
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+&function_begin("aesni_ctr32_encrypt_blocks");
+ &mov ($inp,&wparam(0)); # in
+ &mov ($out,&wparam(1)); # out
+ &mov ($len,&wparam(2)); # blocks
+ &mov ($key,&wparam(3)); # key
+ &mov ($rounds_,&wparam(4)); # ivec pointer, held in $rounds_ until the load below
+ &mov ($key_,"esp"); # remember incoming esp
+ &sub ("esp",60); # frame: offsets 0..51 are used below
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_); # save incoming esp, restored at ctr32_ret
+
+ &movups ($inout3,&QWP(0,$rounds_)); # load ivec
+
+ # compose byte-swap control mask for pshufb on stack
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ &mov ($rounds,3); # every loop pass consumes 3 counter values
+ &xor ($key_,$key_); # zero, used for the unused lane and to wipe the counter
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$rounds);
+ &mov (&DWP(24,"esp"),$rounds);
+ &mov (&DWP(28,"esp"),$key_); # fourth dword of the increment is 0
+
+ &pextrd ($rounds_,$inout3,3); # pull 32-bit counter
+ &pinsrd ($inout3,$key_,3); # wipe 32-bit counter
+
+ &mov ($rounds,&DWP(240,$key)); # key->rounds
+ &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
+
+ # $ivec is vector of 3 32-bit counters
+ &pxor ($ivec,$ivec);
+ &bswap ($rounds_); # byte-swap so that inc below acts on the counter value
+ &pinsrd ($ivec,$rounds_,0); # counter+0
+ &inc ($rounds_);
+ &pinsrd ($ivec,$rounds_,1); # counter+1
+ &inc ($rounds_);
+ &pinsrd ($ivec,$rounds_,2); # counter+2
+
+ &cmp ($len,4); # flags are consumed by jbe below
+ &pshufb ($ivec,$rndkey0); # byte swap
+ &jbe (&label("ctr32_tail")); # 4 blocks or fewer: skip the loop
+ &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec
+ &mov ($rounds_,$rounds); # backup $rounds
+ &mov ($key_,$key); # backup $key
+ &sub ($len,4); # bias $len, undone by the add after the loop
+ &jmp (&label("ctr32_loop3"));
+
+&set_label("ctr32_loop3",16);
+ &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$ivec,2<<6);
+ &pshufd ($inout2,$ivec,1<<6);
+ &por ($inout0,$inout3); # merge counter-less ivec
+ &por ($inout1,$inout3);
+ &por ($inout2,$inout3);
+
+ &call ("_aesni_encrypt3");
+
+ &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask
+ &movups ($in0,&QWP(0,$inp)); # three input blocks; $in1 shares xmm7 with $inout3
+ &movups ($in1,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &pshufb($ivec,$rndkey0); # byte swap
+ &paddd ($ivec,&QWP(16,"esp")); # counter increment
+ &pxor ($in0,$inout0); # input ^= contents of $inout0..2 after the call
+ &pxor ($in1,$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$in1);
+ &movups (&QWP(0x20,$out),$rndkey1);
+ &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec
+ &pshufb($ivec,$rndkey0); # byte swap
+
+ &sub ($len,3); # flags are consumed by ja below
+ &lea ($inp,&DWP(0x30,$inp));
+ &lea ($out,&DWP(0x30,$out));
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+ &ja (&label("ctr32_loop3"));
+
+ &add ($len,4); # undo the bias applied before the loop
+ &pextrd ($rounds_,$ivec,1); # might need last counter value
+ &jz (&label("ctr32_ret")); # taken only if no blocks remain
+ &bswap ($rounds_);
+
+&set_label("ctr32_tail");
+ &cmp ($len,2); # flags are consumed by jb/je below
+ &pshufd ($inout0,$ivec,3<<6);
+ &pshufd ($inout1,$ivec,2<<6);
+ &pshufd ($inout2,$ivec,1<<6);
+ &por ($inout0,$inout3);
+ &jb (&label("ctr32_one")); # fewer than 2 blocks
+ &por ($inout1,$inout3);
+ &je (&label("ctr32_two"));
+ &cmp ($len,3);
+ &por ($inout2,$inout3);
+ &je (&label("ctr32_three")); # otherwise fall through to the 4-block case
+
+ &inc ($rounds_); # compose last counter value
+ &bswap ($rounds_);
+ &pinsrd ($inout3,$rounds_,3);
+
+ &call ("_aesni_encrypt4");
+
+ &movups ($in0,&QWP(0,$inp)); # four input blocks; xmm7 is avoided as it holds $inout3
+ &movups ($rndkey1,&QWP(0x10,$inp));
+ &movups ($rndkey0,&QWP(0x20,$inp));
+ &movups ($ivec,&QWP(0x30,$inp));
+ &pxor ($in0,$inout0);
+ &pxor ($rndkey1,$inout1);
+ &pxor ($rndkey0,$inout2);
+ &pxor ($ivec,$inout3);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$rndkey1);
+ &movups (&QWP(0x20,$out),$rndkey0);
+ &movups (&QWP(0x30,$out),$ivec);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_one",16);
+ if ($inline) # inline expansion or call, decided when this script runs
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &movups ($in0,&QWP(0,$inp));
+ &pxor ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
+ &jmp (&label("ctr32_ret"));
+&set_label("ctr32_two",16);
+ &call ("_aesni_encrypt3"); # 3x routine reused; only $inout0/$inout1 are stored
+ &movups ($in0,&QWP(0,$inp));
+ &movups ($in1,&QWP(0x10,$inp));
+ &pxor ($in0,$inout0);
+ &pxor ($in1,$inout1);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$in1);
+ &jmp (&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+ &call ("_aesni_encrypt3");
+ &movups ($in0,&QWP(0,$inp));
+ &movups ($in1,&QWP(0x10,$inp));
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &pxor ($in0,$inout0);
+ &pxor ($in1,$inout1);
+ &pxor ($rndkey1,$inout2);
+ &movups (&QWP(0,$out),$in0);
+ &movups (&QWP(0x10,$out),$in1);
+ &movups (&QWP(0x20,$out),$rndkey1);
+
+&set_label("ctr32_ret");
+ &mov ("esp",&DWP(48,"esp")); # restore esp saved at entry
+&function_end("aesni_ctr32_encrypt_blocks");
+}
+
+######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
@@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") {
&mov ($inp,$out); # $inp and $out are the same
&mov ($key,$key_); # restore $key
&jmp (&label("cbc_enc_loop"));
-
+######################################################################
&set_label("cbc_decrypt",16);
- &sub ($len,0x40);
+ &cmp ($len,0x40);
&jbe (&label("cbc_dec_tail"));
+ &sub ($len,0x40);
&jmp (&label("cbc_dec_loop3"));
&set_label("cbc_dec_loop3",16);
@@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-0x10,$out),$inout2);
&ja (&label("cbc_dec_loop3"));
-&set_label("cbc_dec_tail");
&add ($len,0x40);
&jz (&label("cbc_ret"));
+&set_label("cbc_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x10);
&movaps ($in0,$inout0);
@@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") {
&mov ($key_,&wparam(4));
&movups (&QWP(0,$key_),$ivec); # output IV
&function_end("${PREFIX}_cbc_encrypt");
-
+
+######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,