diff options
author | Andy Polyakov <appro@openssl.org> | 2010-05-13 21:01:24 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2010-05-13 21:01:24 +0000 |
commit | 629fd3aa913f547f6228740d5068193f283abe94 (patch) | |
tree | f1cd883000ee1336bf7cef94edda5f418b7f690f /crypto/rc4/asm/rc4-x86_64.pl | |
parent | 1aa8a6297c600f3ef13895df887691a3ca244ab6 (diff) |
rc4-x86_64.pl: "Westmere" optimization.
Diffstat (limited to 'crypto/rc4/asm/rc4-x86_64.pl')
-rwxr-xr-x | crypto/rc4/asm/rc4-x86_64.pl | 5 |
1 file changed, 5 insertions, 0 deletions
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 677be5fe25..23fe4d9996 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -58,6 +58,10 @@
 # fit for Core2 and therefore the code was modified to skip cloop8 on
 # this CPU.
 
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
 $flavour = shift;
 $output = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -229,6 +233,7 @@ $code.=<<___;
 .align 16
 .Lcloop1:
 	add	$TX[0]#b,$YY#b
+	movzb	$YY#b,$YY#d
 	movzb	($dat,$YY),$TY#d
 	movb	$TX[0]#b,($dat,$YY)
 	movb	$TY#b,($dat,$XX[0])