summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2011-05-22 18:38:00 +0000
committerAndy Polyakov <appro@openssl.org>2011-05-22 18:38:00 +0000
commitf8501464cc8fd8b7b4983462944a1894b157d735 (patch)
tree31840baeabd26ad8bbb922752e156cf17d00c995
parent96abea332c8b70b77e87390cbe016021971fb83b (diff)
aesni-x86[_64].pl: optimize for Sandy Bridge and add XTS mode.
-rw-r--r--crypto/aes/asm/aesni-x86.pl1634
-rw-r--r--crypto/aes/asm/aesni-x86_64.pl2149
2 files changed, 3161 insertions, 622 deletions
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 1a1bf539cd..712149ab4b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -27,7 +27,21 @@
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
-# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
+# interleaves at most 6 aes[enc|dec] instructions, because there are
+# not enough registers for 8x interleave [which should be optimal for
+# Sandy Bridge]. Actually, performance results for 6x interleave
+# factor presented in aesni-x86_64.pl (except for CTR) are for this
+# module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
@@ -51,14 +65,14 @@ $out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp"; # backup copy for $key
-$inout0="xmm0";
-$inout1="xmm1";
-$inout2="xmm2";
-$rndkey0="xmm3";
-$rndkey1="xmm4";
-$ivec="xmm5";
-$in0="xmm6";
-$in1="xmm7"; $inout3="xmm7";
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5"; $in1="xmm5";
+$inout4="xmm6"; $in0="xmm6";
+$inout5="xmm7"; $ivec="xmm7";
# AESNI extenstion
sub aeskeygenassist
@@ -80,13 +94,15 @@ sub aesdeclast { aescommon(0xdf,@_); }
# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
-{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
$sn++;
- &movdqu ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
&lea ($key,&DWP(32,$key));
- &pxor ($inout,$rndkey0);
+ &xorps ($inout,$ivec) if (defined($ivec));
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
&set_label("${p}1_loop_$sn");
eval"&aes${p} ($inout,$rndkey1)";
&dec ($rounds);
@@ -100,9 +116,9 @@ sub aesni_generate1 # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
&function_begin_B("_aesni_${p}rypt1");
- &movdqu ($rndkey0,&QWP(0,$key));
+ &movups ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(0x10,$key));
- &pxor ($inout,$rndkey0);
+ &xorps ($inout,$rndkey0);
&$movekey ($rndkey0,&QWP(0x20,$key));
&lea ($key,&DWP(0x30,$key));
&cmp ($rounds,11);
@@ -147,7 +163,7 @@ sub aesni_generate1 # fully unrolled loop
&function_begin_B("${PREFIX}_encrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
- &movdqu ($inout0,&QWP(0,"eax"));
+ &movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
@@ -163,7 +179,7 @@ sub aesni_generate1 # fully unrolled loop
&function_begin_B("${PREFIX}_decrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
- &movdqu ($inout0,&QWP(0,"eax"));
+ &movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
@@ -174,16 +190,19 @@ sub aesni_generate1 # fully unrolled loop
&ret ();
&function_end_B("${PREFIX}_decrypt");
-# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
-# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
-# latency is 6, it turned out that it can be scheduled only every
-# *second* cycle. Thus 3x interleave is the one providing optimal
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine. As soon
-# as/if Intel improves throughput by making it possible to schedule
-# the instructions in question *every* cycles I would have to
-# implement 6x interleave and use it in loop...
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x, but it's unfeasible to accommodate it
+# in XMM registers addreassable in 32-bit mode and therefore 6x is
+# used instead...
+
sub aesni_generate3
{ my $p=shift;
@@ -192,7 +211,7 @@ sub aesni_generate3
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&lea ($key,&DWP(32,$key));
- &pxor ($inout0,$rndkey0);
+ &xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
@@ -231,13 +250,13 @@ sub aesni_generate4
&$movekey ($rndkey1,&QWP(16,$key));
&shr ($rounds,1);
&lea ($key,&DWP(32,$key));
- &pxor ($inout0,$rndkey0);
+ &xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key));
- &set_label("${p}3_loop");
+ &set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&dec ($rounds);
@@ -250,7 +269,7 @@ sub aesni_generate4
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
&$movekey ($rndkey0,&QWP(0,$key));
- &jnz (&label("${p}3_loop"));
+ &jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
@@ -263,10 +282,73 @@ sub aesni_generate4
&ret();
&function_end_B("_aesni_${p}rypt4");
}
+
+sub aesni_generate6
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt6");
+ &static_label("_aesni_${p}rypt6_enter");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0); # pxor does better here
+ eval"&aes${p} ($inout0,$rndkey1)";
+ &pxor ($inout2,$rndkey0);
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &pxor ($inout3,$rndkey0);
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &pxor ($inout4,$rndkey0);
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &pxor ($inout5,$rndkey0);
+ eval"&aes${p} ($inout4,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &jmp (&label("_aesni_${p}rypt6_enter"));
+
+ &set_label("${p}6_loop",16);
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ &set_label("_aesni_${p}rypt6_enter",16);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+ eval"&aes${p} ($inout4,$rndkey0)";
+ eval"&aes${p} ($inout5,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}6_loop"));
+
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ eval"&aes${p} ($inout4,$rndkey1)";
+ eval"&aes${p} ($inout5,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ eval"&aes${p}last ($inout2,$rndkey0)";
+ eval"&aes${p}last ($inout3,$rndkey0)";
+ eval"&aes${p}last ($inout4,$rndkey0)";
+ eval"&aes${p}last ($inout5,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt6");
+}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
if ($PREFIX eq "aesni") {
######################################################################
@@ -278,37 +360,62 @@ if ($PREFIX eq "aesni") {
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
- &mov ($rounds,&wparam(4));
- &cmp ($len,16);
- &jb (&label("ecb_ret"));
+ &mov ($rounds_,&wparam(4));
&and ($len,-16);
- &test ($rounds,$rounds)
+ &jz (&label("ecb_ret"));
&mov ($rounds,&DWP(240,$key));
+ &test ($rounds_,$rounds_);
+ &jz (&label("ecb_decrypt"));
+
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
- &jz (&label("ecb_decrypt"));
+ &cmp ($len,0x60);
+ &jb (&label("ecb_enc_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
- &cmp ($len,0x40);
- &jbe (&label("ecb_enc_tail"));
- &sub ($len,0x40);
- &jmp (&label("ecb_enc_loop3"));
+ &call ("_aesni_encrypt6");
-&set_label("ecb_enc_loop3",16);
- &movups ($inout0,&QWP(0,$inp));
- &movups ($inout1,&QWP(0x10,$inp));
- &movups ($inout2,&QWP(0x20,$inp));
- &call ("_aesni_encrypt3");
- &lea ($inp,&DWP(0x30,$inp));
- &movups (&QWP(0,$out),$inout0);
&mov ($key,$key_); # restore $key
- &movups (&QWP(0x10,$out),$inout1);
&mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_enc_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
- &lea ($out,&DWP(0x30,$out));
- &sub ($len,0x30);
- &ja (&label("ecb_enc_loop3"));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
- &add ($len,0x40);
&set_label("ecb_enc_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x20);
@@ -316,14 +423,18 @@ if ($PREFIX eq "aesni") {
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_enc_two"));
&movups ($inout2,&QWP(0x20,$inp));
- &cmp ($len,0x30);
- &je (&label("ecb_enc_three"));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_enc_three"));
&movups ($inout3,&QWP(0x30,$inp));
- &call ("_aesni_encrypt4");
+ &je (&label("ecb_enc_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_encrypt6");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
jmp (&label("ecb_ret"));
&set_label("ecb_enc_one",16);
@@ -335,7 +446,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_two",16);
- &pxor ($inout2,$inout2);
+ &xorps ($inout2,$inout2);
&call ("_aesni_encrypt3");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
@@ -347,29 +458,65 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+ &call ("_aesni_encrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &jmp (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
- &cmp ($len,0x40);
- &jbe (&label("ecb_dec_tail"));
- &sub ($len,0x40);
- &jmp (&label("ecb_dec_loop3"));
-
-&set_label("ecb_dec_loop3",16);
- &movups ($inout0,&QWP(0,$inp));
- &movups ($inout1,&QWP(0x10,$inp));
- &movups ($inout2,&QWP(0x20,$inp));
- &call ("_aesni_decrypt3");
- &lea ($inp,&DWP(0x30,$inp));
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &cmp ($len,0x60);
+ &jb (&label("ecb_dec_tail"));
+
+ &movdqu ($inout0,&QWP(0,$inp));
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &sub ($len,0x60);
+ &jmp (&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
&movups (&QWP(0,$out),$inout0);
- &mov ($key,$key_); # restore $key
+ &movdqu ($inout0,&QWP(0,$inp));
&movups (&QWP(0x10,$out),$inout1);
+ &movdqu ($inout1,&QWP(0x10,$inp));
+ &movups (&QWP(0x20,$out),$inout2);
+ &movdqu ($inout2,&QWP(0x20,$inp));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movdqu ($inout3,&QWP(0x30,$inp));
+ &movups (&QWP(0x40,$out),$inout4);
+ &movdqu ($inout4,&QWP(0x40,$inp));
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &movdqu ($inout5,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+ &call ("_aesni_decrypt6");
+
+ &mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
+ &sub ($len,0x60);
+ &jnc (&label("ecb_dec_loop6"));
+
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
- &lea ($out,&DWP(0x30,$out));
- &sub ($len,0x30);
- &ja (&label("ecb_dec_loop3"));
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
+ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
- &add ($len,0x40);
&set_label("ecb_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x20);
@@ -377,14 +524,18 @@ if ($PREFIX eq "aesni") {
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_dec_two"));
&movups ($inout2,&QWP(0x20,$inp));
- &cmp ($len,0x30);
- &je (&label("ecb_dec_three"));
+ &cmp ($len,0x40);
+ &jb (&label("ecb_dec_three"));
&movups ($inout3,&QWP(0x30,$inp));
- &call ("_aesni_decrypt4");
+ &je (&label("ecb_dec_four"));
+ &movups ($inout4,&QWP(0x40,$inp));
+ &xorps ($inout5,$inout5);
+ &call ("_aesni_decrypt6");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_one",16);
@@ -396,7 +547,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_two",16);
- &pxor ($inout2,$inout2);
+ &xorps ($inout2,$inout2);
&call ("_aesni_decrypt3");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
@@ -407,6 +558,14 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+ &call ("_aesni_decrypt4");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
@@ -420,6 +579,7 @@ if ($PREFIX eq "aesni") {
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
+{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
@@ -433,7 +593,7 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(48,"esp"),$key_);
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
- &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
@@ -458,27 +618,47 @@ if ($PREFIX eq "aesni") {
&movdqa ($inout0,$ivec);
&set_label("ccm64_enc_outer");
- &movdqu ($in0,&QWP(0,$inp));
- &pshufb ($inout0,$inout3);
- &mov ($key,$key_);
- &mov ($rounds,$rounds_);
- &pxor ($inout1,$in0); # cmac^=inp
- &pxor ($inout2,$inout2);
+ &movups ($in0,&QWP(0,$inp));
+ &pshufb ($inout0,$inout3);
+ &mov ($key,$key_);
+ &mov ($rounds,$rounds_);
- &call ("_aesni_encrypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($in0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=inp
+ &$movekey ($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_enc2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
&paddq ($ivec,&QWP(16,"esp"));
&dec ($len);
&lea ($inp,&DWP(16,$inp));
- &pxor ($in0,$inout0); # inp^=E(ivec)
+ &xorps ($in0,$inout0); # inp^=E(ivec)
&movdqa ($inout0,$ivec);
- &movdqu (&QWP(0,$out),$in0);
+ &movups (&QWP(0,$out),$in0);
&lea ($out,&DWP(16,$out));
&jnz (&label("ccm64_enc_outer"));
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
- &movdqu (&QWP(0,$out),$inout1);
+ &movups (&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_encrypt_blocks");
&function_begin("aesni_ccm64_decrypt_blocks");
@@ -494,7 +674,7 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(48,"esp"),$key_);
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
- &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
+ &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
@@ -524,35 +704,56 @@ if ($PREFIX eq "aesni") {
{ &call ("_aesni_encrypt1"); }
&set_label("ccm64_dec_outer");
- &movdqu ($in0,&QWP(0,$inp));
&paddq ($ivec,&QWP(16,"esp"));
- &dec ($len);
- &lea ($inp,&QWP(16,$inp));
- &pxor ($in0,$inout0);
+ &movups ($in0,&QWP(0,$inp)); # load inp
+ &xorps ($in0,$inout0);
&movdqa ($inout0,$ivec);
+ &lea ($inp,&QWP(16,$inp));
+ &pshufb ($inout0,$inout3);
&mov ($key,$key_);
&mov ($rounds,$rounds_);
- &pshufb ($inout0,$inout3);
- &movdqu (&QWP(0,$out),$in0);
+ &movups (&QWP(0,$out),$in0);
&lea ($out,&DWP(16,$out));
+ &sub ($len,1);
&jz (&label("ccm64_dec_break"));
- &pxor ($inout2,$inout2);
- &call ("_aesni_encrypt3");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($in0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &xorps ($inout0,$rndkey0);
+ &xorps ($cmac,$in0); # cmac^=out
+ &$movekey ($rndkey0,&QWP(0,$key));
+&set_label("ccm64_dec2_loop");
+ &aesenc ($inout0,$rndkey1);
+ &dec ($rounds);
+ &aesenc ($cmac,$rndkey1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &aesenc ($inout0,$rndkey0);
+ &lea ($key,&DWP(32,$key));
+ &aesenc ($cmac,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("ccm64_dec2_loop"));
+ &aesenc ($inout0,$rndkey1);
+ &aesenc ($cmac,$rndkey1);
+ &aesenclast ($inout0,$rndkey0);
+ &aesenclast ($cmac,$rndkey0);
&jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_break",16);
if ($inline)
- { &aesni_inline_generate1("enc",$inout1); }
+ { &aesni_inline_generate1("enc",$cmac,$in0); }
else
- { &call ("_aesni_encrypt1",$inout1); }
+ { &call ("_aesni_encrypt1",$cmac); }
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
- &movdqu (&QWP(0,$out),$inout1);
+ &movups (&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_decrypt_blocks");
+}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
@@ -562,6 +763,14 @@ if ($PREFIX eq "aesni") {
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
+# stack layout:
+# 0 pshufb mask
+# 16 vector addend: 0,6,6,6
+# 32 counter-less ivec
+# 48 1st triplet of counter vector
+# 64 2nd triplet of counter vector
+# 80 saved %esp
+
&function_begin("aesni_ctr32_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
@@ -569,14 +778,14 @@ if ($PREFIX eq "aesni") {
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($key_,"esp");
- &sub ("esp",60);
+ &sub ("esp",88);
&and ("esp",-16); # align stack
- &mov (&DWP(48,"esp"),$key_);
+ &mov (&DWP(80,"esp"),$key_);
&cmp ($len,1);
&je (&label("ctr32_one_shortcut"));
- &movups ($inout3,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
@@ -585,139 +794,167 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
- &mov ($rounds,3);
+ &mov ($rounds,6);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds);
&mov (&DWP(20,"esp"),$rounds);
&mov (&DWP(24,"esp"),$rounds);
&mov (&DWP(28,"esp"),$key_);
- &pextrd ($rounds_,$inout3,3); # pull 32-bit counter
- &pinsrd ($inout3,$key_,3); # wipe 32-bit counter
+ &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
+ &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
&mov ($rounds,&DWP(240,$key)); # key->rounds
- &movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
- # $ivec is vector of 3 32-bit counters
- &pxor ($ivec,$ivec);
+ # compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
- &pinsrd ($ivec,$rounds_,0);
+ &pxor ($rndkey1,$rndkey1);
+ &pxor ($rndkey0,$rndkey0);
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
+ &pinsrd ($rndkey1,$rounds_,0);
+ &lea ($key_,&DWP(3,$rounds_));
+ &pinsrd ($rndkey0,$key_,0);
&inc ($rounds_);
- &pinsrd ($ivec,$rounds_,1);
+ &pinsrd ($rndkey1,$rounds_,1);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,1);
&inc ($rounds_);
- &pinsrd ($ivec,$rounds_,2);
- &pshufb ($ivec,$rndkey0); # byte swap
-
- &cmp ($len,4);
- &jbe (&label("ctr32_tail"));
- &movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec
- &mov ($rounds_,$rounds);
- &mov ($key_,$key);
- &sub ($len,4);
- &jmp (&label("ctr32_loop3"));
-
-&set_label("ctr32_loop3",16);
- &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
- &pshufd ($inout1,$ivec,2<<6);
- &por ($inout0,$inout3); # merge counter-less ivec
- &pshufd ($inout2,$ivec,1<<6);
- &por ($inout1,$inout3);
- &por ($inout2,$inout3);
-
- # inline _aesni_encrypt3 and interleave last round
- # with own code...
-
- &$movekey ($rndkey0,&QWP(0,$key));
- &shr ($rounds,1);
- &$movekey ($rndkey1,&QWP(16,$key));
- &lea ($key,&DWP(32,$key));
+ &pinsrd ($rndkey1,$rounds_,2);
+ &inc ($key_);
+ &pinsrd ($rndkey0,$key_,2);
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+
+ &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &cmp ($len,6);
+ &jb (&label("ctr32_tail"));
+ &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
+ &shr ($rounds,1);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,6);
+ &jmp (&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout0,$rndkey1); # merge counter-less ivec
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout1,$rndkey1);
+ &pshufd ($inout5,$rndkey0,1<<6);
+ &por ($inout2,$rndkey1);
+ &por ($inout3,$rndkey1);
+ &por ($inout4,$rndkey1);
+ &por ($inout5,$rndkey1);
+
+ # inlining _aesni_encrypt6's prologue gives ~4% improvement...
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &dec ($rounds);
&pxor ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
- &pxor ($inout2,$rndkey0);
- &$movekey ($rndkey0,&QWP(0,$key));
-
-&set_label("ctr32_enc_loop3");
&aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,$rndkey0);
&aesenc ($inout1,$rndkey1);
- &dec ($rounds);
+ &pxor ($inout3,$rndkey0);
&aesenc ($inout2,$rndkey1);
- &$movekey ($rndkey1,&QWP(16,$key));
- &aesenc ($inout0,$rndkey0);
- &aesenc ($inout1,$rndkey0);
- &lea ($key,&DWP(32,$key));
- &aesenc ($inout2,$rndkey0);
+ &pxor ($inout4,$rndkey0);
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
&$movekey ($rndkey0,&QWP(0,$key));
- &jnz (&label("ctr32_enc_loop3"));
+ &aesenc ($inout5,$rndkey1);
- &aesenc ($inout0,$rndkey1);
- &aesenc ($inout1,$rndkey1);
- &aesenc ($inout2,$rndkey1);
- &movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
+ &xorps ($inout2,$rndkey1);
+ &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+
+ &paddd ($rndkey1,$rndkey0); # 1st triplet increment
+ &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
+ &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
+
+ &movups ($inout1,&QWP(0x30,$inp));
+ &movups ($inout2,&QWP(0x40,$inp));
+ &xorps ($inout3,$inout1);
+ &movups ($inout1,&QWP(0x50,$inp));
+ &lea ($inp,&DWP(0x60,$inp));
+ &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
+ &pshufb ($rndkey1,$inout0); # byte swap
+ &xorps ($inout4,$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &xorps ($inout5,$inout1);
+ &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
+ &pshufb ($rndkey0,$inout0); # byte swap
+ &movups (&QWP(0x40,$out),$inout4);
+ &pshufd ($inout0,$rndkey1,3<<6);
+ &movups (&QWP(0x50,$out),$inout5);
+ &lea ($out,&DWP(0x60,$out));
- &aesenclast ($inout0,$rndkey0);
- &pshufb ($ivec,$rndkey1); # byte swap
- &movdqu ($in0,&QWP(0,$inp));
- &aesenclast ($inout1,$rndkey0);
- &paddd ($ivec,&QWP(16,"esp")); # counter increment
- &movdqu ($in1,&QWP(0x10,$inp));
- &aesenclast ($inout2,$rndkey0);
- &pshufb ($ivec,$rndkey1); # byte swap
- &movdqu ($rndkey0,&QWP(0x20,$inp));
- &lea ($inp,&DWP(0x30,$inp));
-
- &pxor ($in0,$inout0);
- &mov ($key,$key_);
- &pxor ($in1,$inout1);
- &movdqu (&QWP(0,$out),$in0);
- &pxor ($rndkey0,$inout2);
- &movdqu (&QWP(0x10,$out),$in1);
- &movdqu (&QWP(0x20,$out),$rndkey0);
- &movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec
-
- &sub ($len,3);
- &lea ($out,&DWP(0x30,$out));
&mov ($rounds,$rounds_);
- &ja (&label("ctr32_loop3"));
+ &pshufd ($inout1,$rndkey1,2<<6);
+ &sub ($len,6);
+ &jnc (&label("ctr32_loop6"));
- &pextrd ($rounds_,$ivec,1); # might need last counter value
- &add ($len,4);
- &bswap ($rounds_);
+ &add ($len,6);
+ &jz (&label("ctr32_ret"));
+ &mov ($key,$key_);
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
&set_label("ctr32_tail");
- &pshufd ($inout0,$ivec,3<<6);
- &pshufd ($inout1,$ivec,2<<6);
- &por ($inout0,$inout3);
+ &por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
- &lea ($rounds_,&DWP(1,$rounds_));
- &pshufd ($inout2,$ivec,1<<6);
- &por ($inout1,$inout3);
- &je (&label("ctr32_two"));
- &bswap ($rounds_);
- &por ($inout2,$inout3);
- &cmp ($len,3);
- &je (&label("ctr32_three"));
-
- &pinsrd ($inout3,$rounds_,3); # compose last counter value
- &call ("_aesni_encrypt4");
+ &pshufd ($inout2,$rndkey1,1<<6);
+ &por ($inout1,$inout5);
+ &je (&label("ctr32_two"));
- &movdqu ($in0,&QWP(0,$inp));
- &movdqu ($rndkey1,&QWP(0x10,$inp));
- &pxor ($in0,$inout0);
- &movdqu ($rndkey0,&QWP(0x20,$inp));
- &pxor ($rndkey1,$inout1);
- &movdqu ($ivec,&QWP(0x30,$inp));
- &pxor ($rndkey0,$inout2);
- &movdqu (&QWP(0,$out),$in0);
- &pxor ($ivec,$inout3);
- &movdqu (&QWP(0x10,$out),$rndkey1);
- &movdqu (&QWP(0x20,$out),$rndkey0);
- &movdqu (&QWP(0x30,$out),$ivec);
+ &pshufd ($inout3,$rndkey0,3<<6);
+ &por ($inout2,$inout5);
+ &cmp ($len,4);
+ &jb (&label("ctr32_three"));
+
+ &pshufd ($inout4,$rndkey0,2<<6);
+ &por ($inout3,$inout5);
+ &je (&label("ctr32_four"));
+
+ &por ($inout4,$inout5);
+ &call ("_aesni_encrypt6");
+ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
+ &xorps ($inout0,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+ &xorps ($inout1,$rndkey0);
+ &movups ($rndkey0,&QWP(0x30,$inp));
+ &xorps ($inout2,$rndkey1);
+ &movups ($rndkey1,&QWP(0x40,$inp));
+ &xorps ($inout3,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+ &xorps ($inout4,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
+ &movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_one_shortcut",16);
- &movdqu ($inout0,&QWP(0,$rounds_)); # load ivec
+ &movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
@@ -725,37 +962,757 @@ if ($PREFIX eq "aesni") {
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
- &movdqu ($in0,&QWP(0,$inp));
- &pxor ($in0,$inout0);
- &movdqu (&QWP(0,$out),$in0);
+ &movups ($in0,&QWP(0,$inp));
+ &xorps ($in0,$inout0);
+ &movups (&QWP(0,$out),$in0);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
- &pxor ($inout2,$inout2);
&call ("_aesni_encrypt3");
- &movdqu ($in0,&QWP(0,$inp));
- &movdqu ($in1,&QWP(0x10,$inp));
- &pxor ($in0,$inout0);
- &pxor ($in1,$inout1);
- &movdqu (&QWP(0,$out),$in0);
- &movdqu (&QWP(0x10,$out),$in1);
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,$inp));
+ &xorps ($inout0,$inout3);
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_three",16);
&call ("_aesni_encrypt3");
- &movdqu ($in0,&QWP(0,$inp));
- &movdqu ($in1,&QWP(0x10,$inp));
- &movdqu ($rndkey1,&QWP(0x20,$inp));
- &pxor ($in0,$inout0);
- &pxor ($in1,$inout1);
- &movdqu (&QWP(0,$out),$in0);
- &pxor ($rndkey1,$inout2);
- &movdqu (&QWP(0x10,$out),$in1);
- &movdqu (&QWP(0x20,$out),$rndkey1);
+ &movups ($inout3,&QWP(0,$inp));
+ &movups ($inout4,&QWP(0x10,