author     Andy Polyakov <appro@openssl.org>   2014-02-14 17:06:15 +0100
committer  Andy Polyakov <appro@openssl.org>   2014-02-14 17:17:39 +0100
commit     b347341c75656cf8bc039bd0ea5e3571c9299687 (patch)
tree       142c210cddd2580ae0b31081b040203f0aaaf40a /crypto/aes/asm
parent     c00f8d697aed17edbd002e2f6c989d8fbd7c4ecf (diff)
aes/asm/aesni-x86_64.pl: further optimization for Atom Silvermont.
Improve CBC decrypt and CTR by ~13/16%, which adds up to ~25/33%
improvement over the "pre-Silvermont" version. [Add performance table
to aesni-x86.pl.]

(cherry picked from commit 5599c7331b90d9d29c9914c2a95c16d91485415a)
Diffstat (limited to 'crypto/aes/asm')
-rw-r--r--  crypto/aes/asm/aesni-x86.pl     |  11
-rw-r--r--  crypto/aes/asm/aesni-x86_64.pl  | 189
2 files changed, 179 insertions(+), 21 deletions(-)
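
The new code paths below are selected at run time: the assembly reads
OPENSSL_ia32cap_P+4 (OpenSSL's cached copy of CPUID(1).ECX), masks out
bits 22 (MOVBE) and 26 (XSAVE), and takes the 6x-interleave route when
MOVBE is present without XSAVE, the combination the patch uses to
fingerprint Silvermont. A minimal Perl sketch of that bit test;
$capword is a hypothetical stand-in for the capability word, not part
of the patch:

    use strict;
    use warnings;

    # CPUID.(EAX=1):ECX bit 22 = MOVBE, bit 26 = XSAVE.  The assembly
    # below does "and $(1<<26|1<<22)" followed by "cmp $(1<<22)".
    sub movbe_without_xsave {
        my ($capword) = @_;    # stand-in for OPENSSL_ia32cap_P+4
        return (($capword & (1 << 26 | 1 << 22)) == 1 << 22) ? 1 : 0;
    }

    printf "%d\n", movbe_without_xsave(1 << 22);             # 1: take 6x path
    printf "%d\n", movbe_without_xsave(1 << 26 | 1 << 22);   # 0: keep 8x path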
diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 6fcbb9581d..c3df97db7b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -43,6 +43,17 @@
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#                  CBC en-/decrypt  CTR   XTS   ECB
+# Westmere         3.77/1.37        1.37  1.52  1.27
+# * Bridge         5.07/0.98        0.99  1.09  0.91
+# Haswell          4.44/0.80        0.97  1.03  0.72
+# Atom             5.77/3.56        3.67  4.03  3.46
+# Bulldozer        5.80/0.98        1.05  1.24  0.93
+
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 96ef5c5114..708fabd3de 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -158,25 +158,19 @@
# in CTR mode AES instruction interleave factor was chosen to be 6x.
######################################################################
-# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
-# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
-# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
-# instruction latency is 9 cycles and that they can be issued every
-# cycle.
-
-######################################################################
-# Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC
-# decrypt, CTR and ECB, 0.73 in XTS.
-
-######################################################################
-# Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt,
-# 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizeable
-# modes [other than XTS] are actually suboptimal, because of penalties
-# incurred by operations on %xmm8-15, which are inevitable with such
-# high instruction interleave factors. This means that performance can
-# be improved by decreasing the interleave factor, but then it would
-# negatively affect other platforms in relatively larger degree.
-# Run-time detection would solve the dilemma...
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#                  CBC en-/decrypt  CTR   XTS   ECB
+# Westmere         3.77/1.25        1.25  1.25  1.26
+# * Bridge         5.07/0.74        0.75  0.90  0.85
+# Haswell          4.44/0.63        0.63  0.73  0.63
+# Atom             5.75/3.54        3.56  4.12  3.87(*)
+# Bulldozer        5.77/0.70        0.72  0.90  0.70
+#
+# (*) Atom ECB result is suboptimal because of penalties incurred
+# by operations on %xmm8-15. As ECB is not considered
+# critical, nothing was done to mitigate the problem.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
@@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
+$code.=".extern OPENSSL_ia32cap_P\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -1119,7 +1114,9 @@ $code.=<<___;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
$movkey 0x10($key),$rndkey1
@@ -1130,10 +1127,104 @@ $code.=<<___;
cmp \$8,$len
jb .Lctr32_tail
+ sub \$6,$len
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x
+
lea 0x80($key),$key # size optimization
- sub \$8,$len
+ sub \$2,$len
jmp .Lctr32_loop8
+.align 16
+.Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ add \$6,$ctr
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp)
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
+
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6
+ movaps 0x00(%rsp),$inout0
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out)
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out
+
+ sub \$6,$len
+ jnc .Lctr32_loop6
+
+ add \$6,$len
+ jz .Lctr32_done
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
.align 32
.Lctr32_loop8:
add \$8,$ctr
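
Why MOVBE pays off in the 6x loop above: a movbe store writes
bswap(%eax), and byte-swapping distributes over xor, so
bswap(ctr ^ bswap(key0)) == bswap(ctr) ^ key0. Swapping $key0 once at
.Lctr32_6x therefore lets every per-block "xor $key0,%eax; movbe
%eax,..." emit a big-endian, pre-xored counter word with no per-block
bswap. A self-contained Perl check of the identity (values arbitrary):

    use strict;
    use warnings;

    sub bswap32 { unpack "N", pack "V", $_[0] }    # swap bytes of a 32-bit word

    my ($ctr, $key0) = (0x00000007, 0xdeadbeef);
    my $via_movbe = bswap32($ctr ^ bswap32($key0));   # what "xor; movbe" stores
    my $direct    = bswap32($ctr) ^ $key0;            # big-endian ctr ^ key word
    printf "%08x == %08x\n", $via_movbe, $direct;     # prints identical values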
@@ -2455,10 +2546,15 @@ $code.=<<___;
movdqa $inout3,$in3
movdqu 0x50($inp),$inout5
movdqa $inout4,$in4
+ mov OPENSSL_ia32cap_P+4(%rip),%r9d
cmp \$0x70,$len
jbe .Lcbc_dec_six_or_seven
- sub \$0x70,$len
+ and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
+ sub \$0x50,$len
+ cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
+ je .Lcbc_dec_loop6_enter
+ sub \$0x20,$len
lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
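
The staggered subtractions above keep one entry point per interleave
width: $len drops by 0x50 before the branch, the Silvermont path jumps
straight to .Lcbc_dec_loop6_enter, and the 8x path takes the remaining
0x20 (0x70 in total) itself. A hypothetical Perl model of the dispatch,
mirroring the labels; $silvermont stands for the MOVBE-without-XSAVE
test on %r9d:

    sub cbc_dec_dispatch {
        my ($len, $silvermont) = @_;
        return ("six_or_seven", $len) if $len <= 0x70;   # jbe .Lcbc_dec_six_or_seven
        $len -= 0x50;                                    # sub \$0x50,$len
        return ("loop6_enter", $len) if $silvermont;     # je  .Lcbc_dec_loop6_enter
        return ("loop8_enter", $len - 0x20);             # sub \$0x20,$len
    }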
@@ -2638,6 +2734,51 @@ $code.=<<___;
movdqa $inout6,$inout0
jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_loop6:
+ movups $inout5,($out)
+ lea 0x10($out),$out
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+.Lcbc_dec_loop6_enter:
+ lea 0x60($inp),$inp
+ movdqa $inout5,$inout6
+
+ call _aesni_decrypt6
+
+ pxor $iv,$inout0 # ^= IV
+ movdqa $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ mov $key_,$key
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ mov $rnds_,$rounds
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ sub \$0x60,$len
+ ja .Lcbc_dec_loop6
+
+ movdqa $inout5,$inout0
+ add \$0x50,$len
+ jle .Lcbc_dec_tail_collected
+ movups $inout5,($out)
+ lea 0x10($out),$out
+
.Lcbc_dec_tail:
movups ($inp),$inout0
sub \$0x10,$len
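
The ciphertext copies carried through the loop above ($in0..$in4 and
$iv/$inout6) exist because CBC decryption xors each decrypted block
with the previous ciphertext block: P[i] = D(C[i]) ^ C[i-1], with
C[-1] being the IV. A toy Perl sketch of that chaining order; the
"cipher" here is a stand-in xor, purely illustrative:

    use strict;
    use warnings;

    sub D { $_[0] ^ 0xab }    # stand-in for AES decryption, NOT real AES

    my $iv   = 0x11;
    my @c    = (0x21, 0x32, 0x43);    # "ciphertext" blocks
    my @p;
    my $prev = $iv;
    for my $c (@c) {
        push @p, D($c) ^ $prev;       # P[i] = D(C[i]) ^ C[i-1]
        $prev = $c;                   # keep C[i], like the $in*/$iv copies
    }
    printf "plaintext: %02x %02x %02x\n", @p;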
@@ -3360,8 +3501,14 @@ sub aesni {
return $line;
}
+sub movbe {
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
print $code;
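
For reference, the movbe() helper above hand-assembles
"movbe %eax,disp8(%rsp)", presumably for assemblers that do not know
the mnemonic: 0f 38 f1 is the MOVBE m32,r32 opcode, 0x44 is the ModRM
byte (mod=01 for a disp8, reg=eax, rm=100 selecting a SIB byte), 0x24
is the SIB byte for a plain %rsp base, and the displacement captured by
the regex becomes the final byte. A standalone check:

    # Same one-liner as in the script, shown in isolation:
    sub movbe { ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; }

    print movbe(12), "\n";    # emits: .byte 0x0f,0x38,0xf1,0x44,0x24,12
                              # i.e.   movbe %eax,12(%rsp)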