summary refs log tree commit diff stats
path: root/crypto/rc4/asm/rc4-x86_64.pl
diff options
context:
space:
mode:
author Andy Polyakov <appro@openssl.org> 2010-05-13 21:01:24 +0000
committer Andy Polyakov <appro@openssl.org> 2010-05-13 21:01:24 +0000
commit 629fd3aa913f547f6228740d5068193f283abe94 (patch)
tree f1cd883000ee1336bf7cef94edda5f418b7f690f /crypto/rc4/asm/rc4-x86_64.pl
parent 1aa8a6297c600f3ef13895df887691a3ca244ab6 (diff)
rc4-x86_64.pl: "Westmere" optimization.
Diffstat (limited to 'crypto/rc4/asm/rc4-x86_64.pl')
-rwxr-xr-x crypto/rc4/asm/rc4-x86_64.pl 5
1 files changed, 5 insertions, 0 deletions
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 677be5fe25..23fe4d9996 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -58,6 +58,10 @@
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -229,6 +233,7 @@ $code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
+ movzb $YY#b,$YY#d
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])