aesni-x86[_64].pl: optimize for Sandy Bridge and add XTS mode.

author: Andy Polyakov <appro@openssl.org> 2011-05-22 18:38:00 +0000
committer: Andy Polyakov <appro@openssl.org> 2011-05-22 18:38:00 +0000
commit: f8501464cc8fd8b7b4983462944a1894b157d735 (patch)
tree: 31840baeabd26ad8bbb922752e156cf17d00c995
parent: 96abea332c8b70b77e87390cbe016021971fb83b (diff)
2 files changed, 3161 insertions, 622 deletions
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 1a1bf539cd..712149ab4b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -27,7 +27,21 @@
 # Lower ratios for smaller block sizes are perfectly understandable,
 # because function call overhead is higher in 32-bit mode. Largest
 # 8-KB block performance is virtually same: 32-bit code is less than
-# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 
+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
+# interleaves at most 6 aes[enc|dec] instructions, because there are
+# not enough registers for 8x interleave [which should be optimal for
+# Sandy Bridge]. Actually, performance results for 6x interleave
+# factor presented in aesni-x86_64.pl (except for CTR) are for this
+# module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
 
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -51,14 +65,14 @@ $out="edi";
 $rounds_="ebx";	# backup copy for $rounds
 $key_="ebp";	# backup copy for $key
 
-$inout0="xmm0";
-$inout1="xmm1";
-$inout2="xmm2";
-$rndkey0="xmm3";
-$rndkey1="xmm4";
-$ivec="xmm5";
-$in0="xmm6";
-$in1="xmm7";	$inout3="xmm7";
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5";	$in1="xmm5";
+$inout4="xmm6";	$in0="xmm6";
+$inout5="xmm7";	$ivec="xmm7";
 
 # AESNI extenstion
 sub aeskeygenassist
@@ -80,13 +94,15 @@ sub aesdeclast	{ aescommon(0xdf,@_); }
 # Inline version of internal aesni_[en|de]crypt1
 { my $sn;
 sub aesni_inline_generate1
-{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
   $sn++;
 
-    &movdqu		($rndkey0,&QWP(0,$key));
+    &$movekey		($rndkey0,&QWP(0,$key));
     &$movekey		($rndkey1,&QWP(16,$key));
+    &xorps		($ivec,$rndkey0)	if (defined($ivec));
     &lea		($key,&DWP(32,$key));
-    &pxor		($inout,$rndkey0);
+    &xorps		($inout,$ivec)		if (defined($ivec));
+    &xorps		($inout,$rndkey0)	if (!defined($ivec));
     &set_label("${p}1_loop_$sn");
 	eval"&aes${p}	($inout,$rndkey1)";
 	&dec		($rounds);
@@ -100,9 +116,9 @@ sub aesni_generate1	# fully unrolled loop
 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
 
     &function_begin_B("_aesni_${p}rypt1");
-	&movdqu		($rndkey0,&QWP(0,$key));
+	&movups		($rndkey0,&QWP(0,$key));
 	&$movekey	($rndkey1,&QWP(0x10,$key));
-	&pxor		($inout,$rndkey0);
+	&xorps		($inout,$rndkey0);
 	&$movekey	($rndkey0,&QWP(0x20,$key));
 	&lea		($key,&DWP(0x30,$key));
 	&cmp		($rounds,11);
@@ -147,7 +163,7 @@ sub aesni_generate1	# fully unrolled loop
 &function_begin_B("${PREFIX}_encrypt");
 	&mov	("eax",&wparam(0));
 	&mov	($key,&wparam(2));
-	&movdqu	($inout0,&QWP(0,"eax"));
+	&movups	($inout0,&QWP(0,"eax"));
 	&mov	($rounds,&DWP(240,$key));
 	&mov	("eax",&wparam(1));
 	if ($inline)
@@ -163,7 +179,7 @@ sub aesni_generate1	# fully unrolled loop
 &function_begin_B("${PREFIX}_decrypt");
 	&mov	("eax",&wparam(0));
 	&mov	($key,&wparam(2));
-	&movdqu	($inout0,&QWP(0,"eax"));
+	&movups	($inout0,&QWP(0,"eax"));
 	&mov	($rounds,&DWP(240,$key));
 	&mov	("eax",&wparam(1));
 	if ($inline)
@@ -174,16 +190,19 @@ sub aesni_generate1	# fully unrolled loop
 	&ret	();
 &function_end_B("${PREFIX}_decrypt");
 
-# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
-# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
-# latency is 6, it turned out that it can be scheduled only every
-# *second* cycle. Thus 3x interleave is the one providing optimal
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why was 3x subroutine originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
 # utilization, i.e. when subroutine's throughput is virtually same as
 # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine. As soon
-# as/if Intel improves throughput by making it possible to schedule
-# the instructions in question *every* cycles I would have to
-# implement 6x interleave and use it in loop...
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x, but it's unfeasible to accommodate it
+# in XMM registers addressable in 32-bit mode and therefore 6x is
+# used instead...
+
 sub aesni_generate3
 { my $p=shift;
 
@@ -192,7 +211,7 @@ sub aesni_generate3
 	&shr		($rounds,1);
 	&$movekey	($rndkey1,&QWP(16,$key));
 	&lea		($key,&DWP(32,$key));
-	&pxor		($inout0,$rndkey0);
+	&xorps		($inout0,$rndkey0);
 	&pxor		($inout1,$rndkey0);
 	&pxor		($inout2,$rndkey0);
 	&$movekey	($rndkey0,&QWP(0,$key));
@@ -231,13 +250,13 @@ sub aesni_generate4
 	&$movekey	($rndkey1,&QWP(16,$key));
 	&shr		($rounds,1);
 	&lea		($key,&DWP(32,$key));
-	&pxor		($inout0,$rndkey0);
+	&xorps		($inout0,$rndkey0);
 	&pxor		($inout1,$rndkey0);
 	&pxor		($inout2,$rndkey0);
 	&pxor		($inout3,$rndkey0);
 	&$movekey	($rndkey0,&QWP(0,$key));
 
-    &set_label("${p}3_loop");
+    &set_label("${p}4_loop");
 	eval"&aes${p}	($inout0,$rndkey1)";
 	eval"&aes${p}	($inout1,$rndkey1)";
 	&dec		($rounds);
@@ -250,7 +269,7 @@ sub aesni_generate4
 	eval"&aes${p}	($inout2,$rndkey0)";
 	eval"&aes${p}	($inout3,$rndkey0)";
 	&$movekey	($rndkey0,&QWP(0,$key));
-    &jnz		(&label("${p}3_loop"));
+    &jnz		(&label("${p}4_loop"));
 
     eval"&aes${p}	($inout0,$rndkey1)";
     eval"&aes${p}	($inout1,$rndkey1)";
@@ -263,10 +282,73 @@ sub aesni_generate4
     &ret();
     &function_end_B("_aesni_${p}rypt4");
 }
+
+sub aesni_generate6
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt6");
+    &static_label("_aesni_${p}rypt6_enter");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);	# pxor does better here
+	eval"&aes${p}	($inout0,$rndkey1)";
+	&pxor		($inout2,$rndkey0);
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&pxor		($inout3,$rndkey0);
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	&pxor		($inout4,$rndkey0);
+	eval"&aes${p}	($inout3,$rndkey1)";
+	&pxor		($inout5,$rndkey0);
+	eval"&aes${p}	($inout4,$rndkey1)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+	eval"&aes${p}	($inout5,$rndkey1)";
+	&jmp		(&label("_aesni_${p}rypt6_enter"));
+
+    &set_label("${p}6_loop",16);
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	eval"&aes${p}	($inout3,$rndkey1)";
+	eval"&aes${p}	($inout4,$rndkey1)";
+	eval"&aes${p}	($inout5,$rndkey1)";
+    &set_label("_aesni_${p}rypt6_enter",16);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	eval"&aes${p}	($inout3,$rndkey0)";
+	eval"&aes${p}	($inout4,$rndkey0)";
+	eval"&aes${p}	($inout5,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+    &jnz		(&label("${p}6_loop"));
+
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}	($inout3,$rndkey1)";
+    eval"&aes${p}	($inout4,$rndkey1)";
+    eval"&aes${p}	($inout5,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    eval"&aes${p}last	($inout3,$rndkey0)";
+    eval"&aes${p}last	($inout4,$rndkey0)";
+    eval"&aes${p}last	($inout5,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt6");
+}
 &aesni_generate3("enc") if ($PREFIX eq "aesni");
 &aesni_generate3("dec");
 &aesni_generate4("enc") if ($PREFIX eq "aesni");
 &aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
 
 if ($PREFIX eq "aesni") {
 ######################################################################
@@ -278,37 +360,62 @@ if ($PREFIX eq "aesni") {
 	&mov	($out,&wparam(1));
 	&mov	($len,&wparam(2));
 	&mov	($key,&wparam(3));
-	&mov	($rounds,&wparam(4));
-	&cmp	($len,16);
-	&jb	(&label("ecb_ret"));
+	&mov	($rounds_,&wparam(4));
 	&and	($len,-16);
-	&test	($rounds,$rounds)
+	&jz	(&label("ecb_ret"));
 	&mov	($rounds,&DWP(240,$key));
+	&test	($rounds_,$rounds_);
+	&jz	(&label("ecb_decrypt"));
+
 	&mov	($key_,$key);		# backup $key
 	&mov	($rounds_,$rounds);	# backup $rounds
-	&jz	(&label("ecb_decrypt"));
+	&cmp	($len,0x60);
+	&jb	(&label("ecb_enc_tail"));
+
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&sub	($len,0x60);
+	&jmp	(&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
 
-	&cmp	($len,0x40);
-	&jbe	(&label("ecb_enc_tail"));
-	&sub	($len,0x40);
-	&jmp	(&label("ecb_enc_loop3"));
+	&call	("_aesni_encrypt6");
 
-&set_label("ecb_enc_loop3",16);
-	&movups	($inout0,&QWP(0,$inp));
-	&movups	($inout1,&QWP(0x10,$inp));
-	&movups	($inout2,&QWP(0x20,$inp));
-	&call	("_aesni_encrypt3");
-	&lea	($inp,&DWP(0x30,$inp));
-	&movups	(&QWP(0,$out),$inout0);
 	&mov	($key,$key_);		# restore $key
-	&movups	(&QWP(0x10,$out),$inout1);
 	&mov	($rounds,$rounds_);	# restore $rounds
+	&sub	($len,0x60);
+	&jnc	(&label("ecb_enc_loop6"));
+
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
-	&lea	($out,&DWP(0x30,$out));
-	&sub	($len,0x30);
-	&ja	(&label("ecb_enc_loop3"));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&add	($len,0x60);
+	&jz	(&label("ecb_ret"));
 
-	&add	($len,0x40);
 &set_label("ecb_enc_tail");
 	&movups	($inout0,&QWP(0,$inp));
 	&cmp	($len,0x20);
@@ -316,14 +423,18 @@ if ($PREFIX eq "aesni") {
 	&movups	($inout1,&QWP(0x10,$inp));
 	&je	(&label("ecb_enc_two"));
 	&movups	($inout2,&QWP(0x20,$inp));
-	&cmp	($len,0x30);
-	&je	(&label("ecb_enc_three"));
+	&cmp	($len,0x40);
+	&jb	(&label("ecb_enc_three"));
 	&movups	($inout3,&QWP(0x30,$inp));
-	&call	("_aesni_encrypt4");
+	&je	(&label("ecb_enc_four"));
+	&movups	($inout4,&QWP(0x40,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_encrypt6");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
 	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
 	jmp	(&label("ecb_ret"));
 
 &set_label("ecb_enc_one",16);
@@ -335,7 +446,7 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("ecb_ret"));
 
 &set_label("ecb_enc_two",16);
-	&pxor	($inout2,$inout2);
+	&xorps	($inout2,$inout2);
 	&call	("_aesni_encrypt3");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
@@ -347,29 +458,65 @@ if ($PREFIX eq "aesni") {
 	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
 	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+	&call	("_aesni_encrypt4");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&jmp	(&label("ecb_ret"));
 ######################################################################
 &set_label("ecb_decrypt",16);
-	&cmp	($len,0x40);
-	&jbe	(&label("ecb_dec_tail"));
-	&sub	($len,0x40);
-	&jmp	(&label("ecb_dec_loop3"));
-
-&set_label("ecb_dec_loop3",16);
-	&movups	($inout0,&QWP(0,$inp));
-	&movups	($inout1,&QWP(0x10,$inp));
-	&movups	($inout2,&QWP(0x20,$inp));
-	&call	("_aesni_decrypt3");
-	&lea	($inp,&DWP(0x30,$inp));
+	&mov	($key_,$key);		# backup $key
+	&mov	($rounds_,$rounds);	# backup $rounds
+	&cmp	($len,0x60);
+	&jb	(&label("ecb_dec_tail"));
+
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&sub	($len,0x60);
+	&jmp	(&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
 	&movups	(&QWP(0,$out),$inout0);
-	&mov	($key,$key_);		# restore $key
+	&movdqu	($inout0,&QWP(0,$inp));
 	&movups	(&QWP(0x10,$out),$inout1);
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+	&call	("_aesni_decrypt6");
+
+	&mov	($key,$key_);		# restore $key
 	&mov	($rounds,$rounds_);	# restore $rounds
+	&sub	($len,0x60);
+	&jnc	(&label("ecb_dec_loop6"));
+
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
-	&lea	($out,&DWP(0x30,$out));
-	&sub	($len,0x30);
-	&ja	(&label("ecb_dec_loop3"));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&add	($len,0x60);
+	&jz	(&label("ecb_ret"));
 
-	&add	($len,0x40);
 &set_label("ecb_dec_tail");
 	&movups	($inout0,&QWP(0,$inp));
 	&cmp	($len,0x20);
@@ -377,14 +524,18 @@ if ($PREFIX eq "aesni") {
 	&movups	($inout1,&QWP(0x10,$inp));
 	&je	(&label("ecb_dec_two"));
 	&movups	($inout2,&QWP(0x20,$inp));
-	&cmp	($len,0x30);
-	&je	(&label("ecb_dec_three"));
+	&cmp	($len,0x40);
+	&jb	(&label("ecb_dec_three"));
 	&movups	($inout3,&QWP(0x30,$inp));
-	&call	("_aesni_decrypt4");
+	&je	(&label("ecb_dec_four"));
+	&movups	($inout4,&QWP(0x40,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_decrypt6");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
 	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
 	&jmp	(&label("ecb_ret"));
 
 &set_label("ecb_dec_one",16);
@@ -396,7 +547,7 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("ecb_ret"));
 
 &set_label("ecb_dec_two",16);
-	&pxor	($inout2,$inout2);
+	&xorps	($inout2,$inout2);
 	&call	("_aesni_decrypt3");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
@@ -407,6 +558,14 @@ if ($PREFIX eq "aesni") {
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+	&call	("_aesni_decrypt4");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
 
 &set_label("ecb_ret");
 &function_end("aesni_ecb_encrypt");
@@ -420,6 +579,7 @@ if ($PREFIX eq "aesni") {
 # does not update *ivec! Nor does it finalize CMAC value
 # (see engine/eng_aesni.c for details)
 #
+{ my $cmac=$inout1;
 &function_begin("aesni_ccm64_encrypt_blocks");
 	&mov	($inp,&wparam(0));
 	&mov	($out,&wparam(1));
@@ -433,7 +593,7 @@ if ($PREFIX eq "aesni") {
 	&mov	(&DWP(48,"esp"),$key_);
 
 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
-	&movdqu	($inout1,&QWP(0,$rounds));	# load cmac
+	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
 
 	# compose byte-swap control mask for pshufb on stack
 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
@@ -458,27 +618,47 @@ if ($PREFIX eq "aesni") {
 	&movdqa	($inout0,$ivec);
 
 &set_label("ccm64_enc_outer");
-	&movdqu	($in0,&QWP(0,$inp));
-	&pshufb	($inout0,$inout3);
-	&mov	($key,$key_);
-	&mov	($rounds,$rounds_);
-	&pxor	($inout1,$in0);			# cmac^=inp
-	&pxor	($inout2,$inout2);
+	&movups		($in0,&QWP(0,$inp));
+	&pshufb		($inout0,$inout3);
+	&mov		($key,$key_);
+	&mov		($rounds,$rounds_);
 
-	&call	("_aesni_encrypt3");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&xorps		($in0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&xorps		($cmac,$in0);		# cmac^=inp
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+	&aesenc		($inout0,$rndkey1);
+	&dec		($rounds);
+	&aesenc		($cmac,$rndkey1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&aesenc		($inout0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&aesenc		($cmac,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("ccm64_enc2_loop"));
+	&aesenc		($inout0,$rndkey1);
+	&aesenc		($cmac,$rndkey1);
+	&aesenclast	($inout0,$rndkey0);
+	&aesenclast	($cmac,$rndkey0);
 
 	&paddq	($ivec,&QWP(16,"esp"));
 	&dec	($len);
 	&lea	($inp,&DWP(16,$inp));
-	&pxor	($in0,$inout0);			# inp^=E(ivec)
+	&xorps	($in0,$inout0);			# inp^=E(ivec)
 	&movdqa	($inout0,$ivec);
-	&movdqu	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0,$out),$in0);
 	&lea	($out,&DWP(16,$out));
 	&jnz	(&label("ccm64_enc_outer"));
 
 	&mov	("esp",&DWP(48,"esp"));
 	&mov	($out,&wparam(5));
-	&movdqu	(&QWP(0,$out),$inout1);
+	&movups	(&QWP(0,$out),$cmac);
 &function_end("aesni_ccm64_encrypt_blocks");
 
 &function_begin("aesni_ccm64_decrypt_blocks");
@@ -494,7 +674,7 @@ if ($PREFIX eq "aesni") {
 	&mov	(&DWP(48,"esp"),$key_);
 
 	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
-	&movdqu	($inout1,&QWP(0,$rounds));	# load cmac
+	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
 
 	# compose byte-swap control mask for pshufb on stack
 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
@@ -524,35 +704,56 @@ if ($PREFIX eq "aesni") {
 	{   &call	("_aesni_encrypt1");	}
 
 &set_label("ccm64_dec_outer");
-	&movdqu	($in0,&QWP(0,$inp));
 	&paddq	($ivec,&QWP(16,"esp"));
-	&dec	($len);
-	&lea	($inp,&QWP(16,$inp));
-	&pxor	($in0,$inout0);
+	&movups	($in0,&QWP(0,$inp));		# load inp
+	&xorps	($in0,$inout0);
 	&movdqa	($inout0,$ivec);
+	&lea	($inp,&QWP(16,$inp));
+	&pshufb	($inout0,$inout3);
 	&mov	($key,$key_);
 	&mov	($rounds,$rounds_);
-	&pshufb	($inout0,$inout3);
-	&movdqu	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0,$out),$in0);
 	&lea	($out,&DWP(16,$out));
 
+	&sub	($len,1);
 	&jz	(&label("ccm64_dec_break"));
 
-	&pxor	($inout2,$inout2);
-	&call	("_aesni_encrypt3");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&xorps		($in0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&xorps		($cmac,$in0);		# cmac^=out
+	&$movekey	($rndkey0,&QWP(0,$key));
 
+&set_label("ccm64_dec2_loop");
+	&aesenc		($inout0,$rndkey1);
+	&dec		($rounds);
+	&aesenc		($cmac,$rndkey1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&aesenc		($inout0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&aesenc		($cmac,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("ccm64_dec2_loop"));
+	&aesenc		($inout0,$rndkey1);
+	&aesenc		($cmac,$rndkey1);
+	&aesenclast	($inout0,$rndkey0);
+	&aesenclast	($cmac,$rndkey0);
 	&jmp	(&label("ccm64_dec_outer"));
 
 &set_label("ccm64_dec_break",16);
 	if ($inline)
-	{   &aesni_inline_generate1("enc",$inout1);	}
+	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
 	else
-	{   &call	("_aesni_encrypt1",$inout1);	}
+	{   &call	("_aesni_encrypt1",$cmac);	}
 
 	&mov	("esp",&DWP(48,"esp"));
 	&mov	($out,&wparam(5));
-	&movdqu	(&QWP(0,$out),$inout1);
+	&movups	(&QWP(0,$out),$cmac);
 &function_end("aesni_ccm64_decrypt_blocks");
+}
 
 ######################################################################
 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
@@ -562,6 +763,14 @@ if ($PREFIX eq "aesni") {
 # Handles only complete blocks, operates on 32-bit counter and
 # does not update *ivec! (see engine/eng_aesni.c for details)
 #
+# stack layout:
+#	0	pshufb mask
+#	16	vector addend: 0,6,6,6
+# 	32	counter-less ivec
+#	48	1st triplet of counter vector
+#	64	2nd triplet of counter vector
+#	80	saved %esp
+
 &function_begin("aesni_ctr32_encrypt_blocks");
 	&mov	($inp,&wparam(0));
 	&mov	($out,&wparam(1));
@@ -569,14 +778,14 @@ if ($PREFIX eq "aesni") {
 	&mov	($key,&wparam(3));
 	&mov	($rounds_,&wparam(4));
 	&mov	($key_,"esp");
-	&sub	("esp",60);
+	&sub	("esp",88);
 	&and	("esp",-16);			# align stack
-	&mov	(&DWP(48,"esp"),$key_);
+	&mov	(&DWP(80,"esp"),$key_);
 
 	&cmp	($len,1);
 	&je	(&label("ctr32_one_shortcut"));
 
-	&movups	($inout3,&QWP(0,$rounds_));	# load ivec
+	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
 
 	# compose byte-swap control mask for pshufb on stack
 	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
@@ -585,139 +794,167 @@ if ($PREFIX eq "aesni") {
 	&mov	(&DWP(12,"esp"),0x00010203);
 
 	# compose counter increment vector on stack
-	&mov	($rounds,3);
+	&mov	($rounds,6);
 	&xor	($key_,$key_);
 	&mov	(&DWP(16,"esp"),$rounds);
 	&mov	(&DWP(20,"esp"),$rounds);
 	&mov	(&DWP(24,"esp"),$rounds);
 	&mov	(&DWP(28,"esp"),$key_);
 
-	&pextrd	($rounds_,$inout3,3);		# pull 32-bit counter
-	&pinsrd	($inout3,$key_,3);		# wipe 32-bit counter
+	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
+	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
 
 	&mov	($rounds,&DWP(240,$key));	# key->rounds
-	&movdqa	($rndkey0,&QWP(0,"esp"));	# load byte-swap mask
 
-	# $ivec is vector of 3 32-bit counters
-	&pxor	($ivec,$ivec);
+	# compose 2 vectors of 3x32-bit counters
 	&bswap	($rounds_);
-	&pinsrd	($ivec,$rounds_,0);
+	&pxor	($rndkey1,$rndkey1);
+	&pxor	($rndkey0,$rndkey0);
+	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
+	&pinsrd	($rndkey1,$rounds_,0);
+	&lea	($key_,&DWP(3,$rounds_));
+	&pinsrd	($rndkey0,$key_,0);
 	&inc	($rounds_);
-	&pinsrd	($ivec,$rounds_,1);
+	&pinsrd	($rndkey1,$rounds_,1);
+	&inc	($key_);
+	&pinsrd	($rndkey0,$key_,1);
 	&inc	($rounds_);
-	&pinsrd	($ivec,$rounds_,2);
-	&pshufb	($ivec,$rndkey0);		# byte swap
-
-	&cmp	($len,4);
-	&jbe	(&label("ctr32_tail"));
-	&movdqa	(&QWP(32,"esp"),$inout3);	# save counter-less ivec
-	&mov	($rounds_,$rounds);
-	&mov	($key_,$key);
-	&sub	($len,4);
-	&jmp	(&label("ctr32_loop3"));
-
-&set_label("ctr32_loop3",16);
-	&pshufd	($inout0,$ivec,3<<6);		# place counter to upper dword
-	&pshufd	($inout1,$ivec,2<<6);
-	&por	($inout0,$inout3);		# merge counter-less ivec
-	&pshufd	($inout2,$ivec,1<<6);
-	&por	($inout1,$inout3);
-	&por	($inout2,$inout3);
-
-	# inline _aesni_encrypt3 and interleave last round
-	# with own code...
-
-	&$movekey	($rndkey0,&QWP(0,$key));
-	&shr		($rounds,1);
-	&$movekey	($rndkey1,&QWP(16,$key));
-	&lea		($key,&DWP(32,$key));
+	&pinsrd	($rndkey1,$rounds_,2);
+	&inc	($key_);
+	&pinsrd	($rndkey0,$key_,2);
+	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
+	&pshufb	($rndkey1,$inout0);		# byte swap
+	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
+	&pshufb	($rndkey0,$inout0);		# byte swap
+
+	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
+	&pshufd	($inout1,$rndkey1,2<<6);
+	&cmp	($len,6);
+	&jb	(&label("ctr32_tail"));
+	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
+	&shr	($rounds,1);
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&sub	($len,6);
+	&jmp	(&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+	&pshufd	($inout2,$rndkey1,1<<6);
+	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
+	&pshufd	($inout3,$rndkey0,3<<6);
+	&por	($inout0,$rndkey1);		# merge counter-less ivec
+	&pshufd	($inout4,$rndkey0,2<<6);
+	&por	($inout1,$rndkey1);
+	&pshufd	($inout5,$rndkey0,1<<6);
+	&por	($inout2,$rndkey1);
+	&por	($inout3,$rndkey1);
+	&por	($inout4,$rndkey1);
+	&por	($inout5,$rndkey1);
+
+	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&lea		($key,&DWP(32,$key_));
+	&dec		($rounds);
 	&pxor		($inout0,$rndkey0);
 	&pxor		($inout1,$rndkey0);
-	&pxor		($inout2,$rndkey0);
-	&$movekey	($rndkey0,&QWP(0,$key));
-
-&set_label("ctr32_enc_loop3");
 	&aesenc		($inout0,$rndkey1);
+	&pxor		($inout2,$rndkey0);
 	&aesenc		($inout1,$rndkey1);
-	&dec		($rounds);
+	&pxor		($inout3,$rndkey0);
 	&aesenc		($inout2,$rndkey1);
-	&$movekey	($rndkey1,&QWP(16,$key));
-	&aesenc		($inout0,$rndkey0);
-	&aesenc		($inout1,$rndkey0);
-	&lea		($key,&DWP(32,$key));
-	&aesenc		($inout2,$rndkey0);
+	&pxor		($inout4,$rndkey0);
+	&aesenc		($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	&aesenc		($inout4,$rndkey1);
 	&$movekey	($rndkey0,&QWP(0,$key));
-	&jnz		(&label("ctr32_enc_loop3"));
+	&aesenc		($inout5,$rndkey1);
 
-	&aesenc		($inout0,$rndkey1);
-	&aesenc		($inout1,$rndkey1);
-	&aesenc		($inout2,$rndkey1);
-	&movdqa		($rndkey1,&QWP(0,"esp"));	# load byte-swap mask
+	&call		(&label("_aesni_encrypt6_enter"));
+
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout1,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
+	&xorps	($inout2,$rndkey1);
+	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+
+	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
+	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
+	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
+
+	&movups	($inout1,&QWP(0x30,$inp));
+	&movups	($inout2,&QWP(0x40,$inp));
+	&xorps	($inout3,$inout1);
+	&movups	($inout1,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
+	&pshufb	($rndkey1,$inout0);		# byte swap
+	&xorps	($inout4,$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&xorps	($inout5,$inout1);
+	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
+	&pshufb	($rndkey0,$inout0);		# byte swap
+	&movups	(&QWP(0x40,$out),$inout4);
+	&pshufd	($inout0,$rndkey1,3<<6);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
 
-	&aesenclast	($inout0,$rndkey0);
-	&pshufb		($ivec,$rndkey1);		# byte swap
-	&movdqu		($in0,&QWP(0,$inp));
-	&aesenclast	($inout1,$rndkey0);
-	&paddd		($ivec,&QWP(16,"esp"));		# counter increment
-	&movdqu		($in1,&QWP(0x10,$inp));
-	&aesenclast	($inout2,$rndkey0);
-	&pshufb		($ivec,$rndkey1);		# byte swap
-	&movdqu		($rndkey0,&QWP(0x20,$inp));
-	&lea		($inp,&DWP(0x30,$inp));
-
-	&pxor	($in0,$inout0);
-	&mov	($key,$key_);
-	&pxor	($in1,$inout1);
-	&movdqu	(&QWP(0,$out),$in0);
-	&pxor	($rndkey0,$inout2);
-	&movdqu	(&QWP(0x10,$out),$in1);
-	&movdqu	(&QWP(0x20,$out),$rndkey0);
-	&movdqa	($inout3,&QWP(32,"esp"));	# load counter-less ivec
-
-	&sub	($len,3);
-	&lea	($out,&DWP(0x30,$out));
 	&mov	($rounds,$rounds_);
-	&ja	(&label("ctr32_loop3"));
+	&pshufd	($inout1,$rndkey1,2<<6);
+	&sub	($len,6);
+	&jnc	(&label("ctr32_loop6"));
 
-	&pextrd	($rounds_,$ivec,1);		# might need last counter value
-	&add	($len,4);
-	&bswap	($rounds_);
+	&add	($len,6);
+	&jz	(&label("ctr32_ret"));
+	&mov	($key,$key_);
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
 
 &set_label("ctr32_tail");
-	&pshufd	($inout0,$ivec,3<<6);
-	&pshufd	($inout1,$ivec,2<<6);
-	&por	($inout0,$inout3);
+	&por	($inout0,$inout5);
 	&cmp	($len,2);
 	&jb	(&label("ctr32_one"));
-	&lea	($rounds_,&DWP(1,$rounds_));
-	&pshufd	($inout2,$ivec,1<<6);
-	&por	($inout1,$inout3);
-	&je	(&label("ctr32_two"));
-	&bswap	($rounds_);
-	&por	($inout2,$inout3);
-	&cmp	($len,3);
-	&je	(&label("ctr32_three"));
-
-	&pinsrd	($inout3,$rounds_,3);		# compose last counter value
 
-	&call	("_aesni_encrypt4");
+	&pshufd	($inout2,$rndkey1,1<<6);
+	&por	($inout1,$inout5);
+	&je	(&label("ctr32_two"));
 
-	&movdqu	($in0,&QWP(0,$inp));
-	&movdqu	($rndkey1,&QWP(0x10,$inp));
-	&pxor	($in0,$inout0);
-	&movdqu	($rndkey0,&QWP(0x20,$inp));
-	&pxor	($rndkey1,$inout1);
-	&movdqu	($ivec,&QWP(0x30,$inp));
-	&pxor	($rndkey0,$inout2);
-	&movdqu	(&QWP(0,$out),$in0);
-	&pxor	($ivec,$inout3);
-	&movdqu	(&QWP(0x10,$out),$rndkey1);
-	&movdqu	(&QWP(0x20,$out),$rndkey0);
-	&movdqu	(&QWP(0x30,$out),$ivec);
+	&pshufd	($inout3,$rndkey0,3<<6);
+	&por	($inout2,$inout5);
+	&cmp	($len,4);
+	&jb	(&label("ctr32_three"));
+
+	&pshufd	($inout4,$rndkey0,2<<6);
+	&por	($inout3,$inout5);
+	&je	(&label("ctr32_four"));
+
+	&por	($inout4,$inout5);
+	&call	("_aesni_encrypt6");
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout1,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout2,$rndkey1);
+	&movups	($rndkey1,&QWP(0x40,$inp));
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout4,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
 	&jmp	(&label("ctr32_ret"));
 
 &set_label("ctr32_one_shortcut",16);
-	&movdqu	($inout0,&QWP(0,$rounds_));	# load ivec
+	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
 	&mov	($rounds,&DWP(240,$key));
 	
 &set_label("ctr32_one");
@@ -725,37 +962,757 @@ if ($PREFIX eq "aesni") {
 	{   &aesni_inline_generate1("enc");	}
 	else
 	{   &call	("_aesni_encrypt1");	}
-	&movdqu	($in0,&QWP(0,$inp));
-	&pxor	($in0,$inout0);
-	&movdqu	(&QWP(0,$out),$in0);
+	&movups	($in0,&QWP(0,$inp));
+	&xorps	($in0,$inout0);
+	&movups	(&QWP(0,$out),$in0);
 	&jmp	(&label("ctr32_ret"));
 
 &set_label("ctr32_two",16);
-	&pxor	($inout2,$inout2);
 	&call	("_aesni_encrypt3");
-	&movdqu	($in0,&QWP(0,$inp));
-	&movdqu	($in1,&QWP(0x10,$inp));
-	&pxor	($in0,$inout0);
-	&pxor	($in1,$inout1);
-	&movdqu	(&QWP(0,$out),$in0);
-	&movdqu	(&QWP(0x10,$out),$in1);
+	&movups	($inout3,&QWP(0,$inp));
+	&movups	($inout4,&QWP(0x10,$inp));
+	&xorps	($inout0,$inout3);
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
 	&jmp	(&label("ctr32_ret"));
 
 &set_label("ctr32_three",16);
 	&call	("_aesni_encrypt3");
-	&movdqu	($in0,&QWP(0,$inp));
-	&movdqu	($in1,&QWP(0x10,$inp));
-	&movdqu	($rndkey1,&QWP(0x20,$inp));
-	&pxor	($in0,$inout0);
-	&pxor	($in1,$inout1);
-	&movdqu	(&QWP(0,$out),$in0);
-	&pxor	($rndkey1,$inout2);
-	&movdqu	(&QWP(0x10,$out),$in1);
-	&movdqu	(&QWP(0x20,$out),$rndkey1);
+	&movups	($inout3,&QWP(0,$inp));
+	&movups	($inout4,&QWP(0x10,
author	Andy Polyakov <appro@openssl.org>	2011-05-22 18:38:00 +0000
committer	Andy Polyakov <appro@openssl.org>	2011-05-22 18:38:00 +0000
commit	f8501464cc8fd8b7b4983462944a1894b157d735 (patch)
tree	31840baeabd26ad8bbb922752e156cf17d00c995
parent	96abea332c8b70b77e87390cbe016021971fb83b (diff)