summaryrefslogtreecommitdiffstats
path: root/crypto/aes
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2014-02-21 12:14:04 +0100
committerAndy Polyakov <appro@openssl.org>2014-02-21 12:14:04 +0100
commit214368ffee5736836e2dbb80a16a4fbd85f0eaf9 (patch)
tree4d94689b5bd8ee6e6ae590712b8cf418c87b201a /crypto/aes
parent47739161c685790344981c6858fe35c63cd705b1 (diff)
aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak.
Diffstat (limited to 'crypto/aes')
-rw-r--r--crypto/aes/asm/aesni-x86.pl63
-rw-r--r--crypto/aes/asm/aesni-x86_64.pl62
2 files changed, 97 insertions, 28 deletions
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index c3df97db7b..3deb86aed6 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x, but it's unfeasible to accommodate it
-# in XMM registers addreassable in 32-bit mode and therefore 6x is
-# used instead...
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge, but it's unfeasible to accommodate such implementation
+# in XMM registers addreassable in 32-bit mode and therefore maximum
+# of 6x is used instead...
+
+sub aesni_generate2
+{ my $p=shift;
+
+ &function_begin_B("_aesni_${p}rypt2");
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &shl ($rounds,4);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &$movekey ($rndkey0,&QWP(32,$key));
+ &lea ($key,&DWP(32,$key,$rounds));
+ &neg ($rounds);
+ &add ($rounds,16);
+
+ &set_label("${p}2_loop");
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
+ &add ($rounds,32);
+ eval"&aes${p} ($inout0,$rndkey0)";
+ eval"&aes${p} ($inout1,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
+ &jnz (&label("${p}2_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+ eval"&aes${p}last ($inout1,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt2");
+}
sub aesni_generate3
{ my $p=shift;
@@ -357,6 +390,8 @@ sub aesni_generate6
&ret();
&function_end_B("_aesni_${p}rypt6");
}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_two",16);
- &xorps ($inout2,$inout2);
- &call ("_aesni_encrypt3");
+ &call ("_aesni_encrypt2");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ecb_ret"));
@@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_two",16);
- &xorps ($inout2,$inout2);
- &call ("_aesni_decrypt3");
+ &call ("_aesni_decrypt2");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ecb_ret"));
@@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
- &call ("_aesni_encrypt3");
+ &call ("_aesni_encrypt2");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
@@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") {
&lea ($inp,&DWP(16*2,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
- &xorps ($inout2,$inout2);
- &call ("_aesni_encrypt3");
+ &call ("_aesni_encrypt2");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
@@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") {
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
- &call ("_aesni_decrypt3");
+ &call ("_aesni_decrypt2");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
@@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") {
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_two",16);
- &xorps ($inout2,$inout2);
- &call ("_aesni_decrypt3");
+ &call ("_aesni_decrypt2");
&xorps ($inout0,$ivec);
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 708fabd3de..31c80ae6bc 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -288,10 +288,49 @@ ___
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x...
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs...
+
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
+$code.=<<___;
+.type _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+.L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop2
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ ret
+.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
@@ -524,6 +563,8 @@ _aesni_${dir}rypt8:
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -645,8 +686,7 @@ $code.=<<___;
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
+ call _aesni_encrypt2
movups $inout0,($out)
movups $inout1,0x10($out)
jmp .Lecb_ret
@@ -782,8 +822,7 @@ $code.=<<___;
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
- xorps $inout2,$inout2
- call _aesni_decrypt3
+ call _aesni_decrypt2
movups $inout0,($out)
movups $inout1,0x10($out)
jmp .Lecb_ret
@@ -1875,7 +1914,7 @@ $code.=<<___;
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
@@ -2322,7 +2361,7 @@ $code.=<<___;
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
@@ -2831,8 +2870,7 @@ $code.=<<___;
.align 16
.Lcbc_dec_two:
movaps $inout1,$in1
- xorps $inout2,$inout2
- call _aesni_decrypt3
+ call _aesni_decrypt2
pxor $iv,$inout0
movaps $in1,$iv
pxor $in0,$inout1