summaryrefslogtreecommitdiffstats
path: root/crypto/rc4
diff options
context:
space:
mode:
authorAndy Polyakov <appro@openssl.org>2007-04-02 09:50:14 +0000
committerAndy Polyakov <appro@openssl.org>2007-04-02 09:50:14 +0000
commit9babf3929bf1f546aa646d9e1e2a934ccfe0b333 (patch)
tree9b31da5ec08ad7c267b081a3c3a3ff340d9c1fc0 /crypto/rc4
parent2ec0be9e778b7603494f8b9b1ccfc12b9a269760 (diff)
RC4_set_key for x86_64 and Core2 optimization.
PR: 1447
Diffstat (limited to 'crypto/rc4')
-rwxr-xr-xcrypto/rc4/asm/rc4-x86_64.pl115
1 files changed, 115 insertions, 0 deletions
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 5236afec12..36a9429ef7 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -49,6 +49,14 @@
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
+
$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
@@ -152,6 +160,8 @@ $code.=<<___;
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
+ cmp \$0,260($dat)
+ jnz .Lcloop1
push %rbx
jmp .Lcloop8
.align 16
@@ -235,6 +245,111 @@ $code.=<<___;
.size RC4,.-RC4
___
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl RC4_set_key
+.type RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+ lea 8($dat),$dat
+ lea ($inp,$len),$inp
+ neg $len
+ mov $len,%rcx
+ xor %eax,%eax
+ xor $ido,$ido
+ xor %r10,%r10
+ xor %r11,%r11
+
+ mov OPENSSL_ia32cap_P(%rip),$idx#d
+ bt \$20,$idx#d
+ jnc .Lw1stloop
+ bt \$30,$idx#d
+ setc $ido#b
+ mov $ido#d,260($dat)
+ jmp .Lc1stloop
+
+.align 16
+.Lw1stloop:
+ mov %eax,($dat,%rax,4)
+ add \$1,%al
+ jnc .Lw1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lw2ndloop:
+ mov ($dat,$ido,4),%r10d
+ add ($inp,$len,1),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx,4),%r11d
+ cmovz %rcx,$len
+ mov %r10d,($dat,$idx,4)
+ mov %r11d,($dat,$ido,4)
+ add \$1,$ido#b
+ jnc .Lw2ndloop
+ jmp .Lexit_key
+
+.align 16
+.Lc1stloop:
+ mov %al,($dat,%rax)
+ add \$1,%al
+ jnc .Lc1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lc2ndloop:
+ mov ($dat,$ido),%r10b
+ add ($inp,$len),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx),%r11b
+ jnz .Lcnowrap
+ mov %rcx,$len
+.Lcnowrap:
+ mov %r10b,($dat,$idx)
+ mov %r11b,($dat,$ido)
+ add \$1,$ido#b
+ jnc .Lc2ndloop
+ movl \$-1,256($dat)
+
+.align 16
+.Lexit_key:
+ xor %eax,%eax
+ mov %eax,-8($dat)
+ mov %eax,-4($dat)
+ ret
+.size RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type RC4_options,\@function,0
+.align 16
+RC4_options:
+ .picmeup %rax
+ lea .Lopts-.(%rax),%rax
+ mov OPENSSL_ia32cap_P(%rip),%edx
+ bt \$20,%edx
+ jnc .Ldone
+ add \$12,%rax
+ bt \$30,%edx
+ jnc .Ldone
+ add \$13,%rax
+.Ldone:
+ ret
+.align 64
+.Lopts:
+.asciz "rc4(8x,int)"
+.asciz "rc4(8x,char)"
+.asciz "rc4(1x,char)"
+.asciz "RC4 for x86_64, OpenSSL project"
+.align 64
+.size RC4_options,.-RC4_options
+___
+
$code =~ s/#([bwd])/$1/gm;
print $code;